Add a copy of libswscale into the branch instead of using svn:external.

This will allow merging some changes from trunk. Originally committed as revision 18488 to svn://svn.ffmpeg.org/ffmpeg/branches/0.5
17 years ago · beb93f987c
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -0,0 +1,24 @@
 # Makefile fragment for libswscale: names the library, its public header and
 # its object files, then hands off to the shared subdir.mak rules.
 include $(SUBDIR)../config.mak

 NAME = swscale
 # Presumably the sibling FFmpeg libraries this one links against; the
 # variable is interpreted by subdir.mak, which is not shown here.
 FFLIBS = avutil

 HEADERS = swscale.h

 # Objects listed unconditionally.
 OBJS = rgb2rgb.o swscale.o swscale_avoption.o yuv2rgb.o

 # Objects appended to the OBJS-<value> variable selected by each
 # configuration variable; presumably subdir.mak folds the enabled ones
 # into OBJS -- TODO confirm against subdir.mak.
 OBJS-$(ARCH_BFIN)          +=  internal_bfin.o swscale_bfin.o yuv2rgb_bfin.o
 OBJS-$(CONFIG_MLIB)        +=  yuv2rgb_mlib.o
 OBJS-$(HAVE_ALTIVEC)       +=  yuv2rgb_altivec.o
 OBJS-$(HAVE_VIS)           +=  yuv2rgb_vis.o

 # Test programs; the same two names are repeated in CLEANFILES, presumably so
 # that the clean target removes the built binaries.
 TESTS = cs_test swscale-example

 CLEANFILES = cs_test swscale-example

 include $(SUBDIR)../subdir.mak

 # Each test program depends on its own object file and on the library.
 $(SUBDIR)cs_test: $(SUBDIR)cs_test.o $(SUBDIR)$(LIBNAME)

 $(SUBDIR)swscale-example: $(SUBDIR)swscale-example.o $(SUBDIR)$(LIBNAME)
 # Target-specific addition: only swscale-example gets -lm in EXTRALIBS.
 $(SUBDIR)swscale-example: EXTRALIBS += -lm
--- a/libswscale/cs_test.c
+++ b/libswscale/cs_test.c
@@ -0,0 +1,175 @@
 /*
 * Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdio.h>
 #include <string.h>              /* for memset() */
 #include <unistd.h>
 #include <stdlib.h>
 #include <inttypes.h>

 #include "swscale.h"
 #include "rgb2rgb.h"

 #define SIZE 1000
 #define srcByte 0x55
 #define dstByte 0xBB

 #define FUNC(s,d,n) {s,d,#n,n}

 static int cpu_caps;

 static char *args_parse(int argc, char *argv[])
 {
    int o;

    while ((o = getopt(argc, argv, "m23")) != -1) {
        switch (o) {
            case 'm':
                cpu_caps |= SWS_CPU_CAPS_MMX;
                break;
            case '2':
                cpu_caps |= SWS_CPU_CAPS_MMX2;
                break;
            case '3':
                cpu_caps |= SWS_CPU_CAPS_3DNOW;
                break;
            default:
                av_log(NULL, AV_LOG_ERROR, "Unknown option %c\n", o);
        }
    }

    return argv[optind];
 }

 int main(int argc, char **argv)
 {
    int i, funcNum;
    uint8_t *srcBuffer= (uint8_t*)av_malloc(SIZE);
    uint8_t *dstBuffer= (uint8_t*)av_malloc(SIZE);
    int failedNum=0;
    int passedNum=0;

    av_log(NULL, AV_LOG_INFO, "memory corruption test ...\n");
    args_parse(argc, argv);
    av_log(NULL, AV_LOG_INFO, "CPU capabilities forced to %x\n", cpu_caps);
    sws_rgb2rgb_init(cpu_caps);

    for(funcNum=0; ; funcNum++){
        struct func_info_s {
            int src_bpp;
            int dst_bpp;
            const char *name;
            void (*func)(const uint8_t *src, uint8_t *dst, long src_size);
        } func_info[] = {
            FUNC(2, 2, rgb15to16),
            FUNC(2, 3, rgb15to24),
            FUNC(2, 4, rgb15to32),
            FUNC(2, 3, rgb16to24),
            FUNC(2, 4, rgb16to32),
            FUNC(3, 2, rgb24to15),
            FUNC(3, 2, rgb24to16),
            FUNC(3, 4, rgb24to32),
            FUNC(4, 2, rgb32to15),
            FUNC(4, 2, rgb32to16),
            FUNC(4, 3, rgb32to24),
            FUNC(2, 2, rgb16to15),
            FUNC(2, 2, rgb15tobgr15),
            FUNC(2, 2, rgb15tobgr16),
            FUNC(2, 3, rgb15tobgr24),
            FUNC(2, 4, rgb15tobgr32),
            FUNC(2, 2, rgb16tobgr15),
            FUNC(2, 2, rgb16tobgr16),
            FUNC(2, 3, rgb16tobgr24),
            FUNC(2, 4, rgb16tobgr32),
            FUNC(3, 2, rgb24tobgr15),
            FUNC(3, 2, rgb24tobgr16),
            FUNC(3, 3, rgb24tobgr24),
            FUNC(3, 4, rgb24tobgr32),
            FUNC(4, 2, rgb32tobgr15),
            FUNC(4, 2, rgb32tobgr16),
            FUNC(4, 3, rgb32tobgr24),
            FUNC(4, 4, rgb32tobgr32),
            FUNC(0, 0, NULL)
        };
        int width;
        int failed=0;
        int srcBpp=0;
        int dstBpp=0;

        if (!func_info[funcNum].func) break;

        av_log(NULL, AV_LOG_INFO,".");
        memset(srcBuffer, srcByte, SIZE);

        for(width=63; width>0; width--){
            int dstOffset;
            for(dstOffset=128; dstOffset<196; dstOffset+=4){
                int srcOffset;
                memset(dstBuffer, dstByte, SIZE);

                for(srcOffset=128; srcOffset<196; srcOffset+=4){
                    uint8_t *src= srcBuffer+srcOffset;
                    uint8_t *dst= dstBuffer+dstOffset;
                    const char *name=NULL;

                    if(failed) break; //don't fill the screen with shit ...

                    srcBpp = func_info[funcNum].src_bpp;
                    dstBpp = func_info[funcNum].dst_bpp;
                    name   = func_info[funcNum].name;

                    func_info[funcNum].func(src, dst, width*srcBpp);

                    if(!srcBpp) break;

                    for(i=0; i<SIZE; i++){
                        if(srcBuffer[i]!=srcByte){
                            av_log(NULL, AV_LOG_INFO, "src damaged at %d w:%d src:%d dst:%d %s\n",
                                   i, width, srcOffset, dstOffset, name);
                            failed=1;
                            break;
                        }
                    }
                    for(i=0; i<dstOffset; i++){
                        if(dstBuffer[i]!=dstByte){
                            av_log(NULL, AV_LOG_INFO, "dst damaged at %d w:%d src:%d dst:%d %s\n",
                                   i, width, srcOffset, dstOffset, name);
                            failed=1;
                            break;
                        }
                    }
                    for(i=dstOffset + width*dstBpp; i<SIZE; i++){
                        if(dstBuffer[i]!=dstByte){
                            av_log(NULL, AV_LOG_INFO, "dst damaged at %d w:%d src:%d dst:%d %s\n",
                                   i, width, srcOffset, dstOffset, name);
                            failed=1;
                            break;
                        }
                    }
                }
            }
        }
        if(failed) failedNum++;
        else if(srcBpp) passedNum++;
    }

    av_log(NULL, AV_LOG_INFO, "\n%d converters passed, %d converters randomly overwrote memory\n", passedNum, failedNum);
    return failedNum;
 }
--- a/libswscale/internal_bfin.S
+++ b/libswscale/internal_bfin.S
@@ -0,0 +1,606 @@
 /*
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *                    April 20, 2007
 *
 * Blackfin video color space converter operations
 * convert I420 YV12 to RGB in various formats
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */


 /*
 YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
 and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.


 The following calculation is used for the conversion:

  r = clipz((y-oy)*cy  + crv*(v-128))
  g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
  b = clipz((y-oy)*cy  + cbu*(u-128))

 y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.


 New factorization to eliminate the truncation error which was
 occurring due to the byteop3p.


 1) Use byteop16m to subtract quad bytes; this works on U8 values,
 so the offsets need to be renormalized to 8 bits.

 2) Scale operands up by a factor of 4 not 8 because Blackfin
   multiplies include a shift.

 3) Compute into the accumulators cy*yx0, cy*yx1.

 4) Compute each of the linear equations:
     r = clipz((y - oy) * cy  + crv * (v - 128))

     g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))

     b = clipz((y - oy) * cy  + cbu * (u - 128))

   Reuse of the accumulators requires that we actually multiply
   twice once with addition and the second time with a subtraction.

   Because of this we need to compute the equations in the order R B
   then G saving the writes for B in the case of 24/32 bit color
   formats.

   API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
                      int dW, uint32_t *coeffs);

       A          B
       ---        ---
       i2 = cb    i3 = cr
       i1 = coeff i0 = y

 Where coeffs have the following layout in memory.

 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;

 coeffs is a pointer to oy.

 The {rgb} masks are only utilized by the 565 packing algorithm. Note the data
 replication is used to simplify the internal algorithms for the dual Mac
 architecture of BlackFin.

 All routines are exported with _ff_bfin_ as a symbol prefix.

 Rough performance gain compared against -O3:

 2779809/1484290 187.28%

 which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
 c/pel for the optimized implementations. Not sure why there is such a
 huge variation on the reference codes on Blackfin I guess it must have
 to do with the memory system.
 */

 /*
  * Section selection: mL3 is the ordinary .text section. mL1 is .l1.text when
  * __FDPIC__ is defined and falls back to mL3 otherwise. MEM is the section
  * used by the yuv2rgb line converters below.
  */
 #define mL3 .text
 #ifdef __FDPIC__
 #define mL1 .l1.text
 #else
 #define mL1 mL3
 #endif
 #define MEM mL1

 /*
  * DEFUN(fname, where, interface) opens a function: it switches to section
  * `where`, exports the symbol _ff_bfin_<fname> with type STT_FUNC, applies
  * `.align 8` and ends with the symbol name, so the use site supplies the
  * trailing ':' that turns it into a label. The `interface` argument is not
  * expanded; it only documents the C prototype at the use site.
  */
 #define DEFUN(fname,where,interface) \
        .section where;              \
        .global _ff_bfin_ ## fname;  \
        .type _ff_bfin_ ## fname, STT_FUNC; \
        .align 8;                    \
        _ff_bfin_ ## fname

 /* DEFUN_END(fname) records the size of _ff_bfin_<fname> up to this point. */
 #define DEFUN_END(fname) \
        .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname


 .text

 /* Byte length of the coefficient table (11 32-bit words); loaded into l1. */
 #define COEFF_LEN        11*4
 /* Byte step loaded into m0 and applied after the first gmask load. */
 #define COEFF_REL_CY_OFF 4*4

 /* Frame-pointer offsets of the stack arguments out, dW and coeffs. */
 #define ARG_OUT   20
 #define ARG_W     24
 #define ARG_COEFF 28

 /*
  * _ff_bfin_yuv2rgb565_line: convert one line of planar YUV to 16-bit packed
  * RGB (5-6-5 layout according to the diagram inside the loop).
  *
  * Y, U and V arrive in r0, r1, r2; out, dW and coeffs are read from the
  * stack frame. Each loop iteration consumes 4 Y bytes, 2 U bytes and 2 V
  * bytes and stores two 32-bit words, i.e. four 16-bit pixels; the loop runs
  * dW >> 2 times, so a remainder of dW modulo 4 is not converted.
  *
  * The coefficient table is walked through i1 with base b1 = coeffs and
  * length l1 = COEFF_LEN, so the pointer wraps around inside the table.
  *
  * NOTE(review): the loads for the next group are issued at the end of every
  * iteration, including the last one, so the routine reads 4 bytes of Y and
  * 2 bytes each of U and V beyond the last converted group -- confirm that
  * callers provide readable memory there.
  */
 DEFUN(yuv2rgb565_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];         // 4 Y samples (one 32-bit load)
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;            // loop count: four pixels per iteration

        lsetup (.L0565, .L1565) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
 .L0565:
        /*
        rrrrrrrr gggggggg bbbbbbbb
         5432109876543210
                    bbbbb >>3
              gggggggg    <<3
         rrrrrrrr         <<8
         rrrrrggggggbbbbb
        */
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                                // y1,y0
        r4 = r4 << 2 (v);                                                // y3,y2
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
        /* First pixel pair (y1,y0 with u0,v0): Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128); the following subtraction restores Y' in a1/a0 */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128); the following subtraction restores Y' in a1/a0 */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        // gmask is the last table word; stepping by m0 wraps i1 back to cy
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
        r2 = r2 << 3 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1=[i1++]; // cy

        /* Second pixel pair (y3,y2 with u1,v1): Y' = y*cy */

        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
        // the parallel loads below fetch the inputs of the next iteration
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // next 4 Y samples
        r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1.h = w[i3++];        // 2v
 .L1565:                                                       r2=[i1++]; // oy

        l1 = 0;                // clear the buffer length set up for i1 above

        (r7:4) = [sp++];
        unlink;
        rts;
 DEFUN_END(yuv2rgb565_line)

 /*
  * _ff_bfin_yuv2rgb555_line: convert one line of planar YUV to 16-bit packed
  * RGB (x-5-5-5 layout according to the diagram inside the loop).
  *
  * Same structure as the 565 routine; only the shift amounts applied before
  * masking differ (>>3, <<7 and <<2 here). Y, U and V arrive in r0, r1, r2;
  * out, dW and coeffs are read from the stack frame. Each loop iteration
  * consumes 4 Y bytes, 2 U bytes and 2 V bytes and stores two 32-bit words,
  * i.e. four 16-bit pixels; the loop runs dW >> 2 times.
  *
  * The coefficient table is walked through i1 with base b1 = coeffs and
  * length l1 = COEFF_LEN, so the pointer wraps around inside the table.
  *
  * NOTE(review): the loads for the next group are issued at the end of every
  * iteration, including the last one, so the routine reads 4 bytes of Y and
  * 2 bytes each of U and V beyond the last converted group -- confirm that
  * callers provide readable memory there.
  */
 DEFUN(yuv2rgb555_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];         // 4 Y samples (one 32-bit load)
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;            // loop count: four pixels per iteration

        lsetup (.L0555, .L1555) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
 .L0555:
        /*
        rrrrrrrr gggggggg bbbbbbbb
         5432109876543210
                    bbbbb >>3
               gggggggg   <<2
          rrrrrrrr        <<7
         xrrrrrgggggbbbbb
        */

        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                                // y1,y0
        r4 = r4 << 2 (v);                                                // y3,y2
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
        /* First pixel pair (y1,y0 with u0,v0): Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128); the following subtraction restores Y' in a1/a0 */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128); the following subtraction restores Y' in a1/a0 */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        // gmask is the last table word; stepping by m0 wraps i1 back to cy
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
        r2 = r2 << 2 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1=[i1++]; // cy

        /* Second pixel pair (y3,y2 with u1,v1): Y' = y*cy */

        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
        // the parallel loads below fetch the inputs of the next iteration
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4Y
        r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1.h=w[i3++]; // 2v

 .L1555:                                                       r2=[i1++]; // oy

        l1 = 0;                // clear the buffer length set up for i1 above

        (r7:4) = [sp++];
        unlink;
        rts;
 DEFUN_END(yuv2rgb555_line)

 /*
  * _ff_bfin_yuv2rgb24_line: convert one line of planar YUV to packed 24-bit
  * output, three bytes per pixel.
  *
  * Y, U and V arrive in r0, r1, r2; out, dW and coeffs are read from the
  * stack frame. Two byte pointers are used for the output: p1 starts at out
  * and p2 at out + 3, so p1 writes the first pixel of each pair and p2 the
  * second. After three byte stores each, both pointers skip another 3 bytes
  * to reach the next pair. Each loop iteration consumes 4 Y bytes, 2 U bytes
  * and 2 V bytes and writes 12 output bytes; the loop runs dW >> 2 times.
  *
  * The mask words of the coefficient table are still loaded into r5 to keep
  * i1 in step with the table, but r5 is not used afterwards in this routine.
  *
  * NOTE(review): the loads for the next group are issued at the end of every
  * iteration, including the last one, so the routine reads 4 bytes of Y and
  * 2 bytes each of U and V beyond the last converted group -- confirm that
  * callers provide readable memory there.
  */
 DEFUN(yuv2rgb24_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];
        p2 = p1;
        p2 += 3;               // p2 addresses the second pixel of each pair

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF]; // coeff buffer
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];         // 4 Y samples (one 32-bit load)
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;            // loop count: four pixels per iteration

        lsetup (.L0888, .L1888) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
 .L0888:
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);               // y1,y0
        r4 = r4 << 2 (v);               // y3,y2
        r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
        r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy

        /* First pixel pair (y1,y0 with u0,v0): Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128); the following subtraction restores Y' in a1/a0 */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        // one byte goes to p1, then the register is shifted down by 16 and
        // the next byte goes to p2
        r2=r2>>16 || B[p1++]=r2;
                     B[p2++]=r2;

        /* B = Y+ cbu*(Cb-128); kept in r3 and stored after G */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        // gmask is the last table word; stepping by m0 wraps i1 back to cy
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, oy,cy,zero

        r2=r2>>16 || B[p1++]=r2;
                     B[p2++]=r2;

        r3=r3>>16 || B[p1++]=r3;
                     B[p2++]=r3                            || r1=[i1++]; // cy

        // skip the three bytes the other pointer has just written
        p1+=3;
        p2+=3;
        /* Second pixel pair (y3,y2 with u1,v1): Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2=r2>>16 || B[p1++]=r2;
        B[p2++]=r2;

        /* B = Y+ cbu*(Cb-128); kept in r3 and stored after G */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask
        // the parallel loads below fetch the inputs of the next iteration
        r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4y
                     B[p2++]=r2 || r1.l = w[i2++]; // 2u
        r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
                     B[p2++]=r3 || r2=[i1++];      // oy

        p1+=3;
 .L1888: p2+=3;

        l1 = 0;                // clear the buffer length set up for i1 above

        (r7:4) = [sp++];
        unlink;
        rts;
 DEFUN_END(yuv2rgb24_line)



 /*
  * Frame-pointer offsets of the stack arguments of the packed-to-planar
  * converters below (vdst, width, height, lumStride, chromStride, srcStride);
  * src, ydst and udst are taken from r0, r1 and r2 instead.
  */
 #define ARG_vdst        20
 #define ARG_width       24
 #define ARG_height      28
 #define ARG_lumStride   32
 #define ARG_chromStride 36
 #define ARG_srcStride   40

 /*
  * _ff_bfin_uyvytoyv12: split packed UYVY into separate Y, U and V planes.
  *
  * The outer loop runs height >> 1 times and handles two source lines (i0 =
  * top, i1 = bottom) per pass; the inner loop runs width >> 2 times and
  * handles four pixels (8 source bytes) of each line. Per inner iteration it
  * stores 4 luma bytes to each of the two luma lines (p0, p1) and 2 bytes to
  * each chroma plane (i2, i3). The chroma values come from byteop1p applied
  * to the top and bottom source words, i.e. one chroma line is produced per
  * pair of source lines (byteop1p is presumably the Blackfin byte-average
  * operation -- confirm against the ISA reference).
  *
  * NOTE(review): at the end of a line pair i0/i1 have advanced by
  * 2*width + 8 bytes and are corrected by srcStride - 8, and p0/p1 have
  * advanced by width bytes and are corrected by lumStride. That lands on the
  * next line pair only when srcStride == 2*width and lumStride == width --
  * TODO confirm callers guarantee this. The source is also read 8 bytes
  * ahead of what is consumed.
  */
 DEFUN(uyvytoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         long width, long height,
                         long lumStride, long chromStride, long srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1;       // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r2 = r0 + r1;
        r1 += -8;  // i0 and i1 are read 8 bytes ahead; compensate in the line step
        m0 = r1;

        i0 = r0;  // uyvy_T even
        i1 = r2;  // uyvy_B odd

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2;  // Y bot odd

        p5 = [fp + ARG_width];
        p4 = [fp + ARG_height];
        r0 = p5;
        p4 = p4 >> 1;  // outer count: two lines per pass
        p5 = p5 >> 2;  // inner count: four pixels per pass

        r2 = [fp + ARG_chromStride];
        r0 = r0 >> 1;
        r2 = r2 - r0;  // chroma line step: chromStride minus the width/2 bytes written
        m1 = r2;

        /*   I0,I1 - src input line pointers
         *   p0,p1 - luma output line pointers
         *   I2    - dstU
         *   I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4;   // H/2
 0:        r0 = [i0++] || r2 = [i1++];
          r1 = [i0++] || r3 = [i1++];
          r4 = byteop1p(r1:0, r3:2);
          r5 = byteop1p(r1:0, r3:2) (r);
          lsetup (2f, 3f) lc0 = p5; // W/4
 2:          r0 = r0 >> 8(v);
            r1 = r1 >> 8(v);
            r2 = r2 >> 8(v);
            r3 = r3 >> 8(v);
            r0 = bytepack(r0, r1);
            r2 = bytepack(r2, r3)         ||  [p0++] = r0;    // yyyy
            r6 = pack(r5.l, r4.l)         ||  [p1++] = r2;    // yyyy
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            r4 = byteop1p(r1:0, r3:2)     ||  w[i2++] = r6.l; // uu
 3:          r5 = byteop1p(r1:0, r3:2) (r) ||  w[i3++] = r6.h; // vv

          // step all pointers to the next line pair
          i0 += m0;
          i1 += m0;
          i2 += m1;
          i3 += m1;
          p0 = p0 + p2;
 1:        p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
 DEFUN_END(uyvytoyv12)

 /*
  * _ff_bfin_yuyvtoyv12: split packed YUYV into separate Y, U and V planes.
  *
  * Same loop structure as the UYVY variant: the outer loop runs height >> 1
  * times over pairs of source lines (i0 = top, i1 = bottom), the inner loop
  * runs width >> 2 times over four pixels (8 source bytes) of each line.
  * Here the luma words are packed with bytepack before the >> 8 shifts, and
  * byteop1p is applied to the shifted words to produce the chroma of one
  * output line per pair of source lines (byteop1p is presumably the Blackfin
  * byte-average operation -- confirm against the ISA reference).
  *
  * NOTE(review): at the end of a line pair i0/i1 have advanced by
  * 2*width + 8 bytes and are corrected by srcStride - 8, and p0/p1 have
  * advanced by width bytes and are corrected by lumStride. That lands on the
  * next line pair only when srcStride == 2*width and lumStride == width --
  * TODO confirm callers guarantee this. The source is also read 8 bytes
  * ahead of what is consumed.
  */
 DEFUN(yuyvtoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         long width, long height,
                         long lumStride, long chromStride, long srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1;       // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r2 = r0 + r1;
        r1 += -8;  // i0 and i1 are read 8 bytes ahead; compensate in the line step
        m0 = r1;

        i0 = r0;  // yuyv top (even) line
        i1 = r2;  // yuyv bottom (odd) line

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2;  // Y bot odd

        p5 = [fp + ARG_width];
        p4 = [fp + ARG_height];
        r0 = p5;
        p4 = p4 >> 1;  // outer count: two lines per pass
        p5 = p5 >> 2;  // inner count: four pixels per pass

        r2 = [fp + ARG_chromStride];
        r0 = r0 >> 1;
        r2 = r2 - r0;  // chroma line step: chromStride minus the width/2 bytes written
        m1 = r2;

        /*   I0,I1 - src input line pointers
         *   p0,p1 - luma output line pointers
         *   I2    - dstU
         *   I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4;   // H/2
 0:        r0 = [i0++] || r2 = [i1++];
          r1 = [i0++] || r3 = [i1++];
          r4 = bytepack(r0, r1);
          r5 = bytepack(r2, r3);
          lsetup (2f, 3f) lc0 = p5; // W/4
 2:          r0 = r0 >> 8(v) || [p0++] = r4;  // yyyy-even
            r1 = r1 >> 8(v) || [p1++] = r5;  // yyyy-odd
            r2 = r2 >> 8(v);
            r3 = r3 >> 8(v);
            r4 = byteop1p(r1:0, r3:2);
            r5 = byteop1p(r1:0, r3:2) (r);
            r6 = pack(r5.l, r4.l);
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            r4 = bytepack(r0, r1)         ||  w[i2++] = r6.l; // uu
 3:          r5 = bytepack(r2, r3)         ||  w[i3++] = r6.h; // vv

          // step all pointers to the next line pair
          i0 += m0;
          i1 += m0;
          i2 += m1;
          i3 += m1;
          p0 = p0 + p2;
 1:        p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
 DEFUN_END(yuyvtoyv12)
--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -0,0 +1,442 @@
 /*
 * software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 *              software YUV to YUV converter
 *              software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
 #include <inttypes.h>
 #include "config.h"
 #include "libavutil/x86_cpu.h"
 #include "libavutil/bswap.h"
 #include "rgb2rgb.h"
 #include "swscale.h"
 #include "swscale_internal.h"

 /* Compile-time switch defined ahead of the rgb2rgb_template.c inclusions
 * further down; presumably tested inside that template -- TODO confirm. */
 #define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients

 /*
 * Globally visible converter entry points.
 *
 * These are file-scope function pointers defined without initializers, so
 * they are null until something assigns them. sws_rgb2rgb_init() in this
 * file calls one of the rgb2rgb_init_*() functions; presumably those assign
 * the pointers -- confirm in rgb2rgb_template.c.
 *
 * Packed RGB converters: (source bytes, destination bytes, source size).
 */
 void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);

 /* Planar/packed YUV converters: explicit width, height and per-plane strides. */
 void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                   long width, long height,
                   long lumStride, long chromStride, long dstStride);
 void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                   long width, long height,
                   long lumStride, long chromStride, long dstStride);
 void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                      long width, long height,
                      long lumStride, long chromStride, long dstStride);
 void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                      long width, long height,
                      long lumStride, long chromStride, long dstStride);
 void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                   long width, long height,
                   long lumStride, long chromStride, long srcStride);
 void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                    long width, long height,
                    long lumStride, long chromStride, long srcStride);
 void (*planar2x)(const uint8_t *src, uint8_t *dst, long width, long height,
                 long srcStride, long dstStride);
 void (*interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dst,
                        long width, long height, long src1Stride,
                        long src2Stride, long dstStride);
 void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                    uint8_t *dst1, uint8_t *dst2,
                    long width, long height,
                    long srcStride1, long srcStride2,
                    long dstStride1, long dstStride2);
 void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                     uint8_t *dst,
                     long width, long height,
                     long srcStride1, long srcStride2,
                     long srcStride3, long dstStride);

 /*
 * 64-bit bit-mask and multiplier constants, declared with 8-byte alignment
 * via DECLARE_ASM_CONST. They are only compiled for x86 GPL builds, the same
 * condition that guards the MMX/MMX2/3DNOW template inclusions below;
 * presumably they are referenced from the inline assembly in
 * rgb2rgb_template.c -- TODO confirm.
 */
 #if ARCH_X86 && CONFIG_GPL
 DECLARE_ASM_CONST(8, uint64_t, mmx_null)     = 0x0000000000000000ULL;
 DECLARE_ASM_CONST(8, uint64_t, mmx_one)      = 0xFFFFFFFFFFFFFFFFULL;
 DECLARE_ASM_CONST(8, uint64_t, mask32b)      = 0x000000FF000000FFULL;
 DECLARE_ASM_CONST(8, uint64_t, mask32g)      = 0x0000FF000000FF00ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask32r)      = 0x00FF000000FF0000ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask32a)      = 0xFF000000FF000000ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask32)       = 0x00FFFFFF00FFFFFFULL;
 DECLARE_ASM_CONST(8, uint64_t, mask3216br)   = 0x00F800F800F800F8ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask3216g)    = 0x0000FC000000FC00ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask3215g)    = 0x0000F8000000F800ULL;
 DECLARE_ASM_CONST(8, uint64_t, mul3216)      = 0x2000000420000004ULL;
 DECLARE_ASM_CONST(8, uint64_t, mul3215)      = 0x2000000820000008ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask24b)      = 0x00FF0000FF0000FFULL;
 DECLARE_ASM_CONST(8, uint64_t, mask24g)      = 0xFF0000FF0000FF00ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask24r)      = 0x0000FF0000FF0000ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask24l)      = 0x0000000000FFFFFFULL;
 DECLARE_ASM_CONST(8, uint64_t, mask24h)      = 0x0000FFFFFF000000ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask24hh)     = 0xffff000000000000ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask24hhh)    = 0xffffffff00000000ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask24hhhh)   = 0xffffffffffff0000ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask15b)      = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
 DECLARE_ASM_CONST(8, uint64_t, mask15rg)     = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
 DECLARE_ASM_CONST(8, uint64_t, mask15s)      = 0xFFE0FFE0FFE0FFE0ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask15g)      = 0x03E003E003E003E0ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask15r)      = 0x7C007C007C007C00ULL;
 /* The low-5-bit mask is the same bit pattern for 15-bit and 16-bit pixels. */
 #define mask16b mask15b
 DECLARE_ASM_CONST(8, uint64_t, mask16g)      = 0x07E007E007E007E0ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask16r)      = 0xF800F800F800F800ULL;
 DECLARE_ASM_CONST(8, uint64_t, red_16mask)   = 0x0000f8000000f800ULL;
 DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
 DECLARE_ASM_CONST(8, uint64_t, blue_16mask)  = 0x0000001f0000001fULL;
 DECLARE_ASM_CONST(8, uint64_t, red_15mask)   = 0x00007c0000007c00ULL;
 DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
 DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
 #endif /* ARCH_X86 && CONFIG_GPL */

 /*
 * Fixed-point RGB -> YUV coefficients.
 *
 * Each macro is a real-valued weight scaled by 2^RGB2YUV_SHIFT, offset by
 * +0.5 and then truncated by the (int) cast. The first letter names the
 * input channel (R, G, B) and the second the output component (Y, U, V).
 * NOTE(review): the weights look like the BT.601 limited-range matrix --
 * confirm before relying on that.
 */
 #define RGB2YUV_SHIFT 8
 #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
 #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
 #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
 #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
 #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
 #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
 #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
 #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
 #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))

 //Note: We have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW + MMX2 one.
 //
 // rgb2rgb_template.c is included once per variant. Before each inclusion the
 // HAVE_* feature macros are forced to the combination for that variant and
 // RENAME() is redefined so that every name passed through it gets a distinct
 // suffix (_C, _MMX, _MMX2, _3DNOW).
 //
 //plain C versions: every SIMD feature macro forced to 0
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
 #undef HAVE_SSE2
 #define HAVE_MMX 0
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 0
 #define HAVE_SSE2 0
 #define RENAME(a) a ## _C
 #include "rgb2rgb_template.c"

 #if ARCH_X86 && CONFIG_GPL

 //MMX versions: HAVE_MMX=1, everything else still 0
 #undef RENAME
 #undef HAVE_MMX
 #define HAVE_MMX 1
 #define RENAME(a) a ## _MMX
 #include "rgb2rgb_template.c"

 //MMX2 versions: HAVE_MMX stays 1, HAVE_MMX2=1
 #undef RENAME
 #undef HAVE_MMX2
 #define HAVE_MMX2 1
 #define RENAME(a) a ## _MMX2
 #include "rgb2rgb_template.c"

 //3DNOW versions: HAVE_MMX stays 1, HAVE_MMX2 back to 0, HAVE_AMD3DNOW=1
 #undef RENAME
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 1
 #define RENAME(a) a ## _3DNOW
 #include "rgb2rgb_template.c"

 // The HAVE_* macros keep their last values from here on; the #if inside
 // sws_rgb2rgb_init() below tests them in that state.
 #endif //ARCH_X86 && CONFIG_GPL

 /*
 RGB15->RGB16 original by Strepto/Astral
 ported to gcc & bugfixed : A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32-bit C version, and and&add trick by Michael Niedermayer
 */

 /**
 * Run the init routine of exactly one converter variant.
 *
 * When the SIMD variants were compiled in, the first matching capability bit
 * in flags wins, checked in the order MMX2, 3DNOW, MMX. If none matches, or
 * the SIMD variants are not compiled, the plain C variant is initialized.
 *
 * @param flags bit mask tested against the SWS_CPU_CAPS_* constants
 */
 void sws_rgb2rgb_init(int flags){
 #if (HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX)  && CONFIG_GPL
    if (flags & SWS_CPU_CAPS_MMX2) {
        rgb2rgb_init_MMX2();
        return;
    }
    if (flags & SWS_CPU_CAPS_3DNOW) {
        rgb2rgb_init_3DNOW();
        return;
    }
    if (flags & SWS_CPU_CAPS_MMX) {
        rgb2rgb_init_MMX();
        return;
    }
 #endif /* (HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX) && CONFIG_GPL */
    /* No SIMD variant selected (or none compiled in). */
    rgb2rgb_init_C();
 }

 /**
 * Expand 8-bit palette indices into packed 32-bit pixels.
 *
 * Every source byte selects one 32-bit palette entry, which is stored
 * unchanged, so the output has the same packed 32-bit format as the palette.
 */
 void palette8topacked32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
 {
    const uint32_t *entries = (const uint32_t *) palette;
    uint32_t *out = (uint32_t *) dst;
    long n = 0;

    while (n < num_pixels) {
        out[n] = entries[src[n]];
        n++;
    }
 }

 /**
 * Expand 8-bit palette indices into packed 24-bit pixels.
 *
 * Palette entries are 4 bytes (ABCD); the first three bytes of the selected
 * entry (ABC) are written for each pixel.
 */
 void palette8topacked24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
 {
    long n = 0;

    while (n < num_pixels)
    {
        const uint8_t *entry = palette + src[n]*4;

        *dst++ = entry[0];
        *dst++ = entry[1];
        *dst++ = entry[2];
        n++;
    }
 }

 /**
 * Replace every 8-bit index by the 16-bit palette entry it selects; the
 * entry is stored as is.
 * Palette is assumed to contain BGR16, see rgb32to16 to convert the palette.
 */
 void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
 {
    const uint16_t *table = (const uint16_t *)palette;
    uint16_t *out = (uint16_t *)dst;
    long n = 0;

    while (n < num_pixels) {
        out[n] = table[src[n]];
        n++;
    }
 }
 /**
 * Same table lookup as palette8torgb16, except that each selected 16-bit
 * entry is passed through bswap_16() before it is stored.
 */
 void palette8tobgr16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
 {
    const uint16_t *table = (const uint16_t *)palette;
    uint16_t *out = (uint16_t *)dst;
    long n;

    for (n = 0; n < num_pixels; n++) {
        const uint16_t entry = table[src[n]];
        out[n] = bswap_16(entry);
    }
 }

 /**
 * Map each 8-bit index to its 16-bit palette entry and store the entry
 * unmodified.
 * Palette is assumed to contain BGR15, see rgb32to15 to convert the palette.
 */
 void palette8torgb15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
 {
    const uint16_t *lut = (const uint16_t *)palette;
    uint16_t *pixel = (uint16_t *)dst;
    long left = num_pixels;

    for (; left > 0; left--)
        *pixel++ = lut[*src++];
 }
 /**
 * Same table lookup as palette8torgb15, except that each selected 16-bit
 * entry is passed through bswap_16() before it is stored.
 */
 void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
 {
    const uint16_t *lut = (const uint16_t *)palette;
    uint16_t *pixel = (uint16_t *)dst;
    long n = 0;

    while (n < num_pixels) {
        pixel[n] = bswap_16(lut[src[n]]);
        n++;
    }
 }

 /**
 * Pack 4-byte pixels into 3-byte pixels.
 *
 * src_size is in bytes; src_size>>2 pixels are converted. Three of the four
 * source bytes are copied in reversed order, and the remaining byte (the
 * first one when WORDS_BIGENDIAN is defined, the last one otherwise) is
 * dropped.
 */
 void rgb32to24(const uint8_t *src, uint8_t *dst, long src_size)
 {
    const long pixel_count = src_size >> 2;
    long n;

    for (n = 0; n < pixel_count; n++, src += 4, dst += 3)
    {
 #ifdef WORDS_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> BGR24 (= B,G,R) */
        dst[0] = src[1];
        dst[1] = src[2];
        dst[2] = src[3];
 #else
        dst[0] = src[2];
        dst[1] = src[1];
        dst[2] = src[0];
 #endif
    }
 }

 /**
 * Expand 3-byte pixels into 4-byte pixels with a fully opaque (255) fourth
 * byte.
 *
 * src_size is in bytes. Only whole pixels (src_size/3) are converted; a
 * trailing partial pixel is ignored so that src is never read past src_size
 * and dst receives exactly 4 bytes per complete source pixel.
 *
 * The three colour bytes are written in reversed order; the 255 byte goes
 * first when WORDS_BIGENDIAN is defined and last otherwise.
 */
 void rgb24to32(const uint8_t *src, uint8_t *dst, long src_size)
 {
    const long num_pixels = src_size / 3;
    long i;
    for (i=0; i<num_pixels; i++)
    {
        #ifdef WORDS_BIGENDIAN
            /* RGB24 (= R,G,B) -> BGR32 (= A,R,G,B) */
            dst[4*i + 0] = 255;
            dst[4*i + 1] = src[3*i + 0];
            dst[4*i + 2] = src[3*i + 1];
            dst[4*i + 3] = src[3*i + 2];
        #else
            dst[4*i + 0] = src[3*i + 2];
            dst[4*i + 1] = src[3*i + 1];
            dst[4*i + 2] = src[3*i + 0];
            dst[4*i + 3] = 255;
        #endif
    }
 }

 /**
 * Expand 16-bit 5:6:5 pixels into 4-byte pixels.
 *
 * src_size is in bytes; src_size/2 pixels are read as uint16_t. The three
 * bit fields are moved to the top of a byte each (low bits left zero) and a
 * 255 byte is added. Little-endian layout is top5, mid6, low5, 255; with
 * WORDS_BIGENDIAN defined the four bytes are emitted in the opposite order.
 */
 void rgb16tobgr32(const uint8_t *src, uint8_t *dst, long src_size)
 {
    const uint16_t *in   = (const uint16_t *)src;
    const uint16_t *stop = in + src_size/2;

    for (; in < stop; in++, dst += 4)
    {
        const uint16_t px = *in;
        const uint8_t top5 = (px & 0xF800) >> 8;
        const uint8_t mid6 = (px & 0x7E0) >> 3;
        const uint8_t low5 = (px & 0x1F) << 3;

 #ifdef WORDS_BIGENDIAN
        dst[0] = 255;
        dst[1] = low5;
        dst[2] = mid6;
        dst[3] = top5;
 #else
        dst[0] = top5;
        dst[1] = mid6;
        dst[2] = low5;
        dst[3] = 255;
 #endif
    }
 }

 /**
 * Expand 16-bit 5:6:5 pixels into 3-byte pixels.
 *
 * src_size is in bytes; src_size/2 pixels are read as uint16_t. Output byte
 * order is top 5 bits, middle 6 bits, low 5 bits, each moved to the top of
 * its byte with the low bits left zero.
 */
 void rgb16to24(const uint8_t *src, uint8_t *dst, long src_size)
 {
    const uint16_t *in   = (const uint16_t *)src;
    const uint16_t *stop = in + src_size/2;

    for (; in < stop; in++, dst += 3)
    {
        const uint16_t px = *in;

        dst[0] = (px & 0xF800) >> 8;
        dst[1] = (px & 0x7E0) >> 3;
        dst[2] = (px & 0x1F) << 3;
    }
 }

 /**
 * Swap the top and bottom 5-bit fields of every 16-bit 5:6:5 pixel; the
 * middle 6 bits stay in place.
 *
 * src_size is in bytes; src_size>>1 pixels are converted.
 */
 void rgb16tobgr16(const uint8_t *src, uint8_t *dst, long src_size)
 {
    const uint16_t *in = (const uint16_t*)src;
    uint16_t *out = (uint16_t*)dst;
    const long count = src_size >> 1;
    long n;

    for (n = 0; n < count; n++)
    {
        const unsigned px = in[n];
        const unsigned top_to_low = px >> 11;
        const unsigned middle     = px & 0x7E0;
        const unsigned low_to_top = px << 11; /* excess bits vanish in the 16-bit store */

        out[n] = top_to_low | middle | low_to_top;
    }
 }

 /**
 * Convert 16-bit 5:6:5 pixels to 15-bit 5:5:5 pixels with the outer fields
 * swapped.
 *
 * The top 5 bits move to the bottom, the low 5 bits move to bits 10..14 and
 * the upper 5 of the 6 middle bits land in bits 5..9 (the lowest middle bit
 * is discarded). src_size is in bytes; src_size>>1 pixels are converted.
 */
 void rgb16tobgr15(const uint8_t *src, uint8_t *dst, long src_size)
 {
    const uint16_t *in = (const uint16_t*)src;
    uint16_t *out = (uint16_t*)dst;
    const long count = src_size >> 1;
    long n;

    for (n = 0; n < count; n++)
    {
        const unsigned px = in[n];
        const unsigned low_field  = px >> 11;
        const unsigned mid_field  = (px & 0x7C0) >> 1;
        const unsigned high_field = (px & 0x1F) << 10;

        out[n] = low_field | mid_field | high_field;
    }
 }

 /**
 * Expand 15-bit 5:5:5 pixels into 4-byte pixels.
 *
 * src_size is in bytes; src_size/2 pixels are read as uint16_t and bit 15 is
 * ignored. Each 5-bit field is moved to the top of a byte and a 255 byte is
 * added. Little-endian layout is bits 10..14, bits 5..9, bits 0..4, 255;
 * with WORDS_BIGENDIAN defined the four bytes are emitted in the opposite
 * order.
 */
 void rgb15tobgr32(const uint8_t *src, uint8_t *dst, long src_size)
 {
    const uint16_t *in   = (const uint16_t *)src;
    const uint16_t *stop = in + src_size/2;

    for (; in < stop; in++, dst += 4)
    {
        const uint16_t px = *in;
        const uint8_t high5 = (px & 0x7C00) >> 7;
        const uint8_t mid5  = (px & 0x3E0) >> 2;
        const uint8_t low5  = (px & 0x1F) << 3;

 #ifdef WORDS_BIGENDIAN
        dst[0] = 255;
        dst[1] = low5;
        dst[2] = mid5;
        dst[3] = high5;
 #else
        dst[0] = high5;
        dst[1] = mid5;
        dst[2] = low5;
        dst[3] = 255;
 #endif
    }
 }

 /**
 * Expand 15-bit 5:5:5 pixels into 3-byte pixels.
 *
 * src_size is in bytes; src_size/2 pixels are read as uint16_t and bit 15 is
 * ignored. Output byte order is bits 10..14, bits 5..9, bits 0..4, each
 * moved to the top of its byte with the low bits left zero.
 */
 void rgb15to24(const uint8_t *src, uint8_t *dst, long src_size)
 {
    const uint16_t *in   = (const uint16_t *)src;
    const uint16_t *stop = in + src_size/2;

    for (; in < stop; in++, dst += 3)
    {
        const uint16_t px = *in;

        dst[0] = (px & 0x7C00) >> 7;
        dst[1] = (px & 0x3E0) >> 2;
        dst[2] = (px & 0x1F) << 3;
    }
 }

 /**
 * Convert 15-bit 5:5:5 pixels to 16-bit 5:6:5 pixels with the outer fields
 * swapped.
 *
 * Bits 10..14 move to the bottom, bits 0..4 move to the top, and the middle
 * 5 bits are shifted up by one so that they occupy bits 6..10 (bit 5 of the
 * result is always zero). src_size is in bytes; src_size>>1 pixels are
 * converted.
 */
 void rgb15tobgr16(const uint8_t *src, uint8_t *dst, long src_size)
 {
    const uint16_t *in = (const uint16_t*)src;
    uint16_t *out = (uint16_t*)dst;
    const long count = src_size >> 1;
    long n;

    for (n = 0; n < count; n++)
    {
        const unsigned px = in[n];
        const unsigned low_field  = (px & 0x7C00) >> 10;
        const unsigned mid_field  = (px & 0x3E0) << 1;
        const unsigned high_field = px << 11; /* excess bits vanish in the 16-bit store */

        out[n] = low_field | mid_field | high_field;
    }
 }

 /**
 * Swap the two outer 5-bit fields of every 15-bit 5:5:5 pixel; the middle
 * field stays in place and bit 15 of the result is always zero.
 *
 * src_size is in bytes; src_size>>1 pixels are converted.
 */
 void rgb15tobgr15(const uint8_t *src, uint8_t *dst, long src_size)
 {
    const uint16_t *in = (const uint16_t*)src;
    uint16_t *out = (uint16_t*)dst;
    const long count = src_size >> 1;
    long n;

    for (n = 0; n < count; n++)
    {
        const unsigned px     = in[n];
        const unsigned outer  = px & 0x7c1F;   /* both outer fields at once */
        const unsigned middle = px & 0x3E0;

        /* One shift each way moves both outer fields; the bits shifted
         * beyond bit 15 vanish in the 16-bit store. */
        out[n] = (outer >> 10) | middle | (outer << 10);
    }
 }

 /**
 * Reorder the fields of 8-bit 2:3:3 pixels into 3:3:2 layout, one byte per
 * pixel (src_size is both the byte and the pixel count).
 *
 * - the top 2 bits are shifted left by one and become the low 3-bit field
 * - the middle 3 bits stay in bits 3..5
 * - the low 3-bit field is reduced to 2 bits by keeping its two low bits
 *   and is placed in bits 6..7
 *
 * NOTE(review): keeping the low bits (mask 0x03) rather than shifting the
 * field right drops its most significant bit -- confirm this is intended.
 */
 void bgr8torgb8(const uint8_t *src, uint8_t *dst, long src_size)
 {
    long n;

    for (n = 0; n < src_size; n++)
    {
        const unsigned px   = src[n];
        const unsigned low3 = px & 0x07;
        const unsigned mid3 = (px & 0x38) >> 3;
        const unsigned top2 = (px & 0xC0) >> 6;

        dst[n] = ((top2 << 1) & 0x07) | (mid3 << 3) | ((low3 & 0x03) << 6);
    }
 }
--- a/libswscale/rgb2rgb.h
+++ b/libswscale/rgb2rgb.h
@@ -0,0 +1,147 @@
 /*
 *  software RGB to RGB converter
 *  pluralize by Software PAL8 to RGB converter
 *               Software YUV to YUV converter
 *               Software YUV to RGB converter
 *  Written by Nick Kurshev.
 *  palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #ifndef SWSCALE_RGB2RGB_H
 #define SWSCALE_RGB2RGB_H

 #include <inttypes.h>

 /* A full collection of RGB to RGB(BGR) converters */
 /* The following are function pointers, not functions: rgb2rgb.c defines
 * them without initializers, so sws_rgb2rgb_init() has to run before any of
 * them is called. All take (source bytes, destination bytes, source size). */
 extern void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb32to16)   (const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb32to15)   (const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb15to16)   (const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb15to32)   (const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb16to15)   (const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb16to32)   (const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb24to16)   (const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb24to15)   (const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);

 /* Ordinary functions implemented directly in rgb2rgb.c; src_size is the
 * size of the source in bytes. */
 void rgb24to32   (const uint8_t *src, uint8_t *dst, long src_size);
 void rgb32to24   (const uint8_t *src, uint8_t *dst, long src_size);
 void rgb16tobgr32(const uint8_t *src, uint8_t *dst, long src_size);
 void rgb16to24   (const uint8_t *src, uint8_t *dst, long src_size);
 void rgb16tobgr16(const uint8_t *src, uint8_t *dst, long src_size);
 void rgb16tobgr15(const uint8_t *src, uint8_t *dst, long src_size);
 void rgb15tobgr32(const uint8_t *src, uint8_t *dst, long src_size);
 void rgb15to24   (const uint8_t *src, uint8_t *dst, long src_size);
 void rgb15tobgr16(const uint8_t *src, uint8_t *dst, long src_size);
 void rgb15tobgr15(const uint8_t *src, uint8_t *dst, long src_size);
 void bgr8torgb8  (const uint8_t *src, uint8_t *dst, long src_size);


 /* Palette expansion: src holds one 8-bit palette index per pixel and
 * num_pixels is a pixel count, not a byte count. The packed32/packed24
 * variants read 4-byte palette entries, the 16/15 variants 2-byte entries. */
 void palette8topacked32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
 void palette8topacked24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
 void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
 void palette8tobgr16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
 void palette8torgb15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
 void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);

 /**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write high quality version.
 */
 //void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,

 /**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
 extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                          long width, long height,
                          long lumStride, long chromStride, long dstStride);

 /**
 * Width should be a multiple of 16.
 */
 extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                             long width, long height,
                             long lumStride, long chromStride, long dstStride);

 /**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
 extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                          long width, long height,
                          long lumStride, long chromStride, long srcStride);

 /**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
 extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                          long width, long height,
                          long lumStride, long chromStride, long dstStride);

 /**
 * Width should be a multiple of 16.
 */
 extern void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                             long width, long height,
                             long lumStride, long chromStride, long dstStride);

 /**
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write high quality version.
 */
 extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                           long width, long height,
                           long lumStride, long chromStride, long srcStride);
 /* The remaining pointers carry no size constraints in this header; their
 * exact semantics are defined by whichever implementation is installed at
 * init time. NOTE(review): consider documenting them from the template. */
 extern void (*planar2x)(const uint8_t *src, uint8_t *dst, long width, long height,
                        long srcStride, long dstStride);

 extern void (*interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dst,
                               long width, long height, long src1Stride,
                               long src2Stride, long dstStride);

 extern void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                           uint8_t *dst1, uint8_t *dst2,
                           long width, long height,
                           long srcStride1, long srcStride2,
                           long dstStride1, long dstStride2);

 extern void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                            uint8_t *dst,
                            long width, long height,
                            long srcStride1, long srcStride2,
                            long srcStride3, long dstStride);

 /**
 * Select and initialize one converter variant. In rgb2rgb.c, flags is tested
 * against SWS_CPU_CAPS_MMX2, SWS_CPU_CAPS_3DNOW and SWS_CPU_CAPS_MMX (in that
 * order, x86 GPL builds only); otherwise the plain C variant is initialized.
 */
 void sws_rgb2rgb_init(int flags);

 #endif /* SWSCALE_RGB2RGB_H */
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
--- a/libswscale/swscale-example.c
+++ b/libswscale/swscale-example.c
@@ -0,0 +1,210 @@
 /*
 * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <inttypes.h>
 #include <stdarg.h>

 #undef HAVE_AV_CONFIG_H
 #include "libavutil/avutil.h"
 #include "swscale.h"
 #include "swscale_internal.h"

 /**
 * Sum of squared differences between two 8-bit planes.
 *
 * @param src1,src2        top-left sample of each plane
 * @param stride1,stride2  distance in bytes between consecutive rows
 * @param w,h              size of the compared area in samples
 * @return the sum over the w*h area of (src1 - src2)^2
 */
 static uint64_t getSSD(uint8_t *src1, uint8_t *src2, int stride1, int stride2, int w, int h){
    uint64_t total = 0;
    int row, col;

    for (row = 0; row < h; row++){
        const uint8_t *line1 = src1 + row*stride1;
        const uint8_t *line2 = src2 + row*stride2;

        for (col = 0; col < w; col++){
            const int diff = line1[col] - line2[col];
            total += diff*diff;
        }
    }
    return total;
 }

 /**
 * Round-trip test: ref -> src -> dst -> out, then compare out against ref.
 * ref and out are YV12 (PIX_FMT_YUV420P).
 *
 * Three scaler contexts are chained:
 *   ref (w x h, YUV420P)          -> src (srcW x srcH, srcFormat)
 *   src (srcW x srcH, srcFormat)  -> dst (dstW x dstH, dstFormat)
 *   dst (dstW x dstH, dstFormat)  -> out (w x h, YUV420P)
 * and the per-plane sum of squared differences between ref and out is
 * printed to stdout.
 *
 * @param ref        the three reference planes
 * @param refStride  row strides of the reference planes; out uses the same
 * @param flags      scaler flags handed to every sws_getContext() call
 * @return 0 on success, -1 if an allocation or context creation failed
 *
 * All buffers and contexts are owned by this function and released before it
 * returns, whether it succeeds or fails.
 */
 static int doTest(uint8_t *ref[3], int refStride[3], int w, int h, int srcFormat, int dstFormat,
                  int srcW, int srcH, int dstW, int dstH, int flags){
    /* Everything the cleanup code touches starts out as NULL, so that a
     * failure at any point can jump to "end" safely. */
    uint8_t *src[3] = {NULL, NULL, NULL};
    uint8_t *dst[3] = {NULL, NULL, NULL};
    uint8_t *out[3] = {NULL, NULL, NULL};
    int srcStride[3], dstStride[3];
    int i;
    uint64_t ssdY, ssdU, ssdV;
    struct SwsContext *srcContext = NULL, *dstContext = NULL, *outContext = NULL;
    int res = 0;

    for (i=0; i<3; i++){
        // avoid stride % bpp != 0
        if (srcFormat==PIX_FMT_RGB24 || srcFormat==PIX_FMT_BGR24)
            srcStride[i]= srcW*3;
        else
            srcStride[i]= srcW*4;

        if (dstFormat==PIX_FMT_RGB24 || dstFormat==PIX_FMT_BGR24)
            dstStride[i]= dstW*3;
        else
            dstStride[i]= dstW*4;

        src[i]= malloc(srcStride[i]*srcH);
        dst[i]= malloc(dstStride[i]*dstH);
        out[i]= malloc(refStride[i]*h);
        if (!src[i] || !dst[i] || !out[i]) {
            perror("Malloc");
            res = -1;

            goto end;
        }
    }

    srcContext= sws_getContext(w, h, PIX_FMT_YUV420P, srcW, srcH, srcFormat, flags, NULL, NULL, NULL);
    if (!srcContext) {
        fprintf(stderr, "Failed to get %s ---> %s\n",
                sws_format_name(PIX_FMT_YUV420P),
                sws_format_name(srcFormat));
        res = -1;

        goto end;
    }
    dstContext= sws_getContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, NULL, NULL, NULL);
    if (!dstContext) {
        fprintf(stderr, "Failed to get %s ---> %s\n",
                sws_format_name(srcFormat),
                sws_format_name(dstFormat));
        res = -1;

        goto end;
    }
    outContext= sws_getContext(dstW, dstH, dstFormat, w, h, PIX_FMT_YUV420P, flags, NULL, NULL, NULL);
    if (!outContext) {
        fprintf(stderr, "Failed to get %s ---> %s\n",
                sws_format_name(dstFormat),
                sws_format_name(PIX_FMT_YUV420P));
        res = -1;

        goto end;
    }

    sws_scale(srcContext, ref, refStride, 0, h   , src, srcStride);
    sws_scale(dstContext, src, srcStride, 0, srcH, dst, dstStride);
    sws_scale(outContext, dst, dstStride, 0, dstH, out, refStride);

    /* Chroma planes are compared at half resolution, rounded up. */
    ssdY= getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
    ssdU= getSSD(ref[1], out[1], refStride[1], refStride[1], (w+1)>>1, (h+1)>>1);
    ssdV= getSSD(ref[2], out[2], refStride[2], refStride[2], (w+1)>>1, (h+1)>>1);

    if (srcFormat == PIX_FMT_GRAY8 || dstFormat==PIX_FMT_GRAY8) ssdU=ssdV=0; //FIXME check that output is really gray

    /* Normalize to a per-sample average. */
    ssdY/= w*h;
    ssdU/= w*h/4;
    ssdV/= w*h/4;

    /* PRIu64 instead of %lld: the values are uint64_t. */
    printf(" %s %dx%d -> %s %4dx%4d flags=%2d SSD=%5"PRIu64",%5"PRIu64",%5"PRIu64"\n",
           sws_format_name(srcFormat), srcW, srcH,
           sws_format_name(dstFormat), dstW, dstH,
           flags, ssdY, ssdU, ssdV);
    fflush(stdout);

    end:

    /* Contexts that were never created are still NULL here; this relies on
     * sws_freeContext() accepting NULL, as the original failure paths did. */
    sws_freeContext(srcContext);
    sws_freeContext(dstContext);
    sws_freeContext(outContext);

    for (i=0; i<3; i++){
        free(src[i]);
        free(dst[i]);
        free(out[i]);
    }

    return res;
 }

 /**
 * Run doTest() for every (srcFormat, dstFormat) pair of pixel formats.
 *
 * For each pair the destination size is swept over widths from w - w/3 to
 * 4*w/3 and heights from h - h/3 to 4*h/3 in steps of a third of the
 * reference size, and for each size over the flag values 1, 2, 4, 8, 16, 32.
 * The first failing doTest() call abandons the sweep for that format pair
 * and the next pair is tried.
 *
 * @param src     reference planes, handed to doTest() unchanged
 * @param stride  row strides of the reference planes
 * @param w,h     size of the reference picture
 */
 static void selfTest(uint8_t *src[3], int stride[3], int w, int h){
    enum PixelFormat srcFormat, dstFormat;
    const int srcW = w;
    const int srcH = h;
    /* A step of 0 (w or h below 3) would never advance the sweep. */
    const int stepW = w/3 > 0 ? w/3 : 1;
    const int stepH = h/3 > 0 ? h/3 : 1;

    for (srcFormat = 0; srcFormat < PIX_FMT_NB; srcFormat++) {
        for (dstFormat = 0; dstFormat < PIX_FMT_NB; dstFormat++) {
            int dstW, dstH, flags;
            int failed = 0; /* set once doTest() fails for this format pair */

            printf("%s -> %s\n",
                   sws_format_name(srcFormat),
                   sws_format_name(dstFormat));
            fflush(stdout);

            for (dstW=w - w/3; !failed && dstW<= 4*w/3; dstW+= stepW){
                for (dstH=h - h/3; !failed && dstH<= 4*h/3; dstH+= stepH){
                    for (flags=1; flags<33; flags*=2) {
                        if (doTest(src, stride, w, h, srcFormat, dstFormat,
                                   srcW, srcH, dstW, dstH, flags) < 0) {
                            failed = 1;
                            break;
                        }
                    }
                }
            }
        }
    }
 }

 #define W 96
 #define H 96

 int main(int argc, char **argv){
    uint8_t *rgb_data = malloc (W*H*4);
    uint8_t *rgb_src[3]= {rgb_data, NULL, NULL};
    int rgb_stride[3]={4*W, 0, 0};
    uint8_t *data = malloc (3*W*H);
    uint8_t *src[3]= {data, data+W*H, data+W*H*2};
    int stride[3]={W, W, W};
    int x, y;
    struct SwsContext *sws;

    sws= sws_getContext(W/12, H/12, PIX_FMT_RGB32, W, H, PIX_FMT_YUV420P, 2, NULL, NULL, NULL);

    for (y=0; y<H; y++){
        for (x=0; x<W*4; x++){
            rgb_data[ x + y*4*W]= random();
        }
    }
    sws_scale(sws, rgb_src, rgb_stride, 0, H, src, stride);

    selfTest(src, stride, W, H);

    return 123;
 }
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
--- a/libswscale/swscale.h
+++ b/libswscale/swscale.h
@@ -0,0 +1,247 @@
 /*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #ifndef SWSCALE_SWSCALE_H
 #define SWSCALE_SWSCALE_H

 /**
 * @file libswscale/swscale.h
 * @brief
 *     external api for the swscale stuff
 */

 #include "libavutil/avutil.h"

 #define LIBSWSCALE_VERSION_MAJOR 0
 #define LIBSWSCALE_VERSION_MINOR 7
 #define LIBSWSCALE_VERSION_MICRO 1

 #define LIBSWSCALE_VERSION_INT  AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
                                               LIBSWSCALE_VERSION_MINOR, \
                                               LIBSWSCALE_VERSION_MICRO)
 #define LIBSWSCALE_VERSION      AV_VERSION(LIBSWSCALE_VERSION_MAJOR, \
                                           LIBSWSCALE_VERSION_MINOR, \
                                           LIBSWSCALE_VERSION_MICRO)
 #define LIBSWSCALE_BUILD        LIBSWSCALE_VERSION_INT

 #define LIBSWSCALE_IDENT        "SwS" AV_STRINGIFY(LIBSWSCALE_VERSION)

 /**
 * Returns the LIBSWSCALE_VERSION_INT constant.
 */
 unsigned swscale_version(void);

 /* values for the flags, the stuff on the command line is different */
 #define SWS_FAST_BILINEAR     1
 #define SWS_BILINEAR          2
 #define SWS_BICUBIC           4
 #define SWS_X                 8
 #define SWS_POINT          0x10
 #define SWS_AREA           0x20
 #define SWS_BICUBLIN       0x40
 #define SWS_GAUSS          0x80
 #define SWS_SINC          0x100
 #define SWS_LANCZOS       0x200
 #define SWS_SPLINE        0x400

 #define SWS_SRC_V_CHR_DROP_MASK     0x30000
 #define SWS_SRC_V_CHR_DROP_SHIFT    16

 #define SWS_PARAM_DEFAULT           123456

 #define SWS_PRINT_INFO              0x1000

 //the following 3 flags are not completely implemented
 //internal chrominance subsampling info
 #define SWS_FULL_CHR_H_INT    0x2000
 //input subsampling info
 #define SWS_FULL_CHR_H_INP    0x4000
 #define SWS_DIRECT_BGR        0x8000
 #define SWS_ACCURATE_RND      0x40000
 #define SWS_BITEXACT          0x80000

 #define SWS_CPU_CAPS_MMX      0x80000000
 #define SWS_CPU_CAPS_MMX2     0x20000000
 #define SWS_CPU_CAPS_3DNOW    0x40000000
 #define SWS_CPU_CAPS_ALTIVEC  0x10000000
 #define SWS_CPU_CAPS_BFIN     0x01000000

 #define SWS_MAX_REDUCE_CUTOFF 0.002

 #define SWS_CS_ITU709         1
 #define SWS_CS_FCC            4
 #define SWS_CS_ITU601         5
 #define SWS_CS_ITU624         5
 #define SWS_CS_SMPTE170M      5
 #define SWS_CS_SMPTE240M      7
 #define SWS_CS_DEFAULT        5



 /**
 * A vector of double-precision coefficients.
 *
 * When used for filters a vector must have an odd number of elements.
 * The coeff array cannot be shared between vectors.
 */
 typedef struct {
    double *coeff;              ///< pointer to the list of coefficients
    int length;                 ///< number of coefficients in the vector
 } SwsVector;

 /**
 * A set of four filter vectors. The same SwsVector can be shared by
 * several members.
 *
 * NOTE(review): the member names suggest luma/chroma and
 * horizontal/vertical filters -- confirm against the users of this struct.
 */
 typedef struct {
    SwsVector *lumH;
    SwsVector *lumV;
    SwsVector *chrH;
    SwsVector *chrV;
 } SwsFilter;

 struct SwsContext;

 void sws_freeContext(struct SwsContext *swsContext);

 /**
 * Allocates and returns a SwsContext. You need it to perform
 * scaling/conversion operations using sws_scale().
 *
 * @param srcW the width of the source image
 * @param srcH the height of the source image
 * @param srcFormat the source image format
 * @param dstW the width of the destination image
 * @param dstH the height of the destination image
 * @param dstFormat the destination image format
 * @param flags specify which algorithm and options to use for rescaling
 * @return a pointer to an allocated context, or NULL in case of error
 */
 struct SwsContext *sws_getContext(int srcW, int srcH, enum PixelFormat srcFormat, int dstW, int dstH, enum PixelFormat dstFormat, int flags,
                                  SwsFilter *srcFilter, SwsFilter *dstFilter, double *param);

 /**
 * Scales the image slice in \p srcSlice and puts the resulting scaled
 * slice in the image in \p dst. A slice is a sequence of consecutive
 * rows in an image.
 *
 * @param context   the scaling context previously created with
 *                  sws_getContext()
 * @param srcSlice  the array containing the pointers to the planes of
 *                  the source slice
 * @param srcStride the array containing the strides for each plane of
 *                  the source image
 * @param srcSliceY the position in the source image of the slice to
 *                  process, that is the number (counted starting from
 *                  zero) in the image of the first row of the slice
 * @param srcSliceH the height of the source slice, that is the number
 *                  of rows in the slice
 * @param dst       the array containing the pointers to the planes of
 *                  the destination image
 * @param dstStride the array containing the strides for each plane of
 *                  the destination image
 * @return          the height of the output slice
 */
 int sws_scale(struct SwsContext *context, uint8_t* srcSlice[], int srcStride[], int srcSliceY,
              int srcSliceH, uint8_t* dst[], int dstStride[]);
 #if LIBSWSCALE_VERSION_MAJOR < 1
 /**
 * @deprecated Use sws_scale() instead.
 */
 int sws_scale_ordered(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
                      int srcSliceH, uint8_t* dst[], int dstStride[]) attribute_deprecated;
 #endif


 int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation);
 int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation);

 /**
 * Returns a normalized Gaussian curve used to filter stuff
 * quality=3 is high quality, lower is lower quality.
 */
 SwsVector *sws_getGaussianVec(double variance, double quality);

 /**
 * Allocates and returns a vector with \p length coefficients, all
 * with the same value \p c.
 */
 SwsVector *sws_getConstVec(double c, int length);

 /**
 * Allocates and returns a vector with just one coefficient, with
 * value 1.0.
 */
 SwsVector *sws_getIdentityVec(void);

 /**
 * Scales all the coefficients of \p a by the \p scalar value.
 */
 void sws_scaleVec(SwsVector *a, double scalar);

 /**
 * Scales all the coefficients of \p a so that their sum equals \p
 * height.
 */
 void sws_normalizeVec(SwsVector *a, double height);
 void sws_convVec(SwsVector *a, SwsVector *b);
 void sws_addVec(SwsVector *a, SwsVector *b);
 void sws_subVec(SwsVector *a, SwsVector *b);
 void sws_shiftVec(SwsVector *a, int shift);

 /**
 * Allocates and returns a clone of the vector \p a, that is a vector
 * with the same coefficients as \p a.
 */
 SwsVector *sws_cloneVec(SwsVector *a);

 #if LIBSWSCALE_VERSION_MAJOR < 1
 /**
 * @deprecated Use sws_printVec2() instead.
 */
 attribute_deprecated void sws_printVec(SwsVector *a);
 #endif

 /**
 * Prints with av_log() a textual representation of the vector \p a
 * if \p log_level <= av_log_level.
 */
 void sws_printVec2(SwsVector *a, AVClass *log_ctx, int log_level);

 void sws_freeVec(SwsVector *a);

 SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
                                float lumaSharpen, float chromaSharpen,
                                float chromaHShift, float chromaVShift,
                                int verbose);
 void sws_freeFilter(SwsFilter *filter);

 /**
 * Checks if \p context can be reused, otherwise reallocates a new
 * one.
 *
 * If \p context is NULL, just calls sws_getContext() to get a new
 * context. Otherwise, checks if the parameters are the ones already
 * saved in \p context. If that is the case, returns the current
 * context. Otherwise, frees \p context and gets a new context with
 * the new parameters.
 *
 * Be warned that \p srcFilter and \p dstFilter are not checked, they
 * are assumed to remain the same.
 */
 struct SwsContext *sws_getCachedContext(struct SwsContext *context,
                                        int srcW, int srcH, enum PixelFormat srcFormat,
                                        int dstW, int dstH, enum PixelFormat dstFormat, int flags,
                                        SwsFilter *srcFilter, SwsFilter *dstFilter, double *param);

 #endif /* SWSCALE_SWSCALE_H */
--- a/libswscale/swscale_altivec_template.c
+++ b/libswscale/swscale_altivec_template.c
@@ -0,0 +1,538 @@
 /*
 * AltiVec-enhanced yuv2yuvX
 *
 * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
 * based on the equivalent C code in swscale.c
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 /* All-zero vector produced by vec_splat_s32(0); call sites cast it to the
 * element type they need. */
 #define vzero vec_splat_s32(0)

 /**
 * Converts dstW 32-bit accumulators to bytes: every val[i] is shifted right
 * by 19 bits and clipped to the 0..255 range before being stored in dest[i].
 *
 * Elements are handled 16 at a time with vector code; a scalar prologue
 * brings a misaligned dest up to a 16-byte boundary and a scalar epilogue
 * handles whatever is left at the end.
 *
 * @param val  dstW input values
 * @param dest dstW output bytes
 * @param dstW number of elements to convert
 */
 static inline void
 altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW) {
    register int i;
    /* shift count 19 in every element, built as 10 + 9 */
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));
    if ((unsigned long)dest % 16) {
        /* badly aligned store, we force store alignment */
        /* and will handle load misalignment on val w/ vec_perm */
        vector unsigned char perm1;
        vector signed int v1;
        /* scalar prologue: runs until dest + i is a multiple of 16 */
        for (i = 0 ; (i < dstW) &&
            (((unsigned long)dest + i) % 16) ; i++) {
                int t = val[i] >> 19;
                dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
        }
        /* i << 2 is the byte offset of val[i] (4 bytes per int) */
        perm1 = vec_lvsl(i << 2, val);
        v1 = vec_ld(i << 2, val);
        for ( ; i < (dstW - 15); i+=16) {
            int offset = i << 2;
            /* five loads are combined pairwise into four realigned vectors */
            vector signed int v2 = vec_ld(offset + 16, val);
            vector signed int v3 = vec_ld(offset + 32, val);
            vector signed int v4 = vec_ld(offset + 48, val);
            vector signed int v5 = vec_ld(offset + 64, val);
            vector signed int v12 = vec_perm(v1, v2, perm1);
            vector signed int v23 = vec_perm(v2, v3, perm1);
            vector signed int v34 = vec_perm(v3, v4, perm1);
            vector signed int v45 = vec_perm(v4, v5, perm1);

            vector signed int vA = vec_sra(v12, altivec_vectorShiftInt19);
            vector signed int vB = vec_sra(v23, altivec_vectorShiftInt19);
            vector signed int vC = vec_sra(v34, altivec_vectorShiftInt19);
            vector signed int vD = vec_sra(v45, altivec_vectorShiftInt19);
            /* pack 4 x int -> 2 x short -> 1 x byte vector */
            vector unsigned short vs1 = vec_packsu(vA, vB);
            vector unsigned short vs2 = vec_packsu(vC, vD);
            vector unsigned char vf = vec_packsu(vs1, vs2);
            vec_st(vf, i, dest);
            /* the last load of this iteration is the first one of the next */
            v1 = v5;
        }
    } else { // dest is properly aligned, great
        /* val is loaded without realignment on this path */
        for (i = 0; i < (dstW - 15); i+=16) {
            int offset = i << 2;
            vector signed int v1 = vec_ld(offset, val);
            vector signed int v2 = vec_ld(offset + 16, val);
            vector signed int v3 = vec_ld(offset + 32, val);
            vector signed int v4 = vec_ld(offset + 48, val);
            vector signed int v5 = vec_sra(v1, altivec_vectorShiftInt19);
            vector signed int v6 = vec_sra(v2, altivec_vectorShiftInt19);
            vector signed int v7 = vec_sra(v3, altivec_vectorShiftInt19);
            vector signed int v8 = vec_sra(v4, altivec_vectorShiftInt19);
            vector unsigned short vs1 = vec_packsu(v5, v6);
            vector unsigned short vs2 = vec_packsu(v7, v8);
            vector unsigned char vf = vec_packsu(vs1, vs2);
            vec_st(vf, i, dest);
        }
    }
    /* scalar epilogue for the elements the vector loops did not reach */
    for ( ; i < dstW ; i++) {
        int t = val[i] >> 19;
        dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
    }
 }

 /**
 * Applies a multi-tap filter across several input lines and writes one
 * 8-bit output line per plane.
 *
 * Luma: for every i in [0, dstW) the value
 *     (1 << 18) + sum over j of lumSrc[j][i] * lumFilter[j]
 * is accumulated in a temporary int array and then converted to bytes by
 * altivec_packIntArrayToCharArray() (shift right by 19, clip to 0..255).
 *
 * Chroma: only done when uDest is non-null. U uses chrSrc[j][i] and V uses
 * chrSrc[j][i + 2048], both weighted with chrFilter[j]; the results go to
 * uDest and vDest.
 *
 * @param lumFilter     lumFilterSize luma coefficients
 * @param lumSrc        lumFilterSize pointers to luma input lines
 * @param lumFilterSize number of luma taps
 * @param chrFilter     chrFilterSize chroma coefficients
 * @param chrSrc        chrFilterSize pointers to chroma input lines, V
 *                      samples stored 2048 elements after the U samples
 * @param chrFilterSize number of chroma taps
 * @param dest          luma output, dstW bytes
 * @param uDest         U output, chrDstW bytes, or 0 to skip chroma
 * @param vDest         V output, chrDstW bytes
 * @param dstW          luma output width
 * @param chrDstW       chroma output width
 */
 static inline void
 yuv2yuvX_altivec_real(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                      int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                      uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
 {
    /* rounding term: half of the 1 << 19 divisor applied when packing */
    const vector signed int vini = {(1 << 18), (1 << 18), (1 << 18), (1 << 18)};
    register int i, j;
    {
        /* one 32-bit accumulator per output pixel, on the stack */
        int __attribute__ ((aligned (16))) val[dstW];

        /* fill the accumulators with the rounding term, 4 ints per store */
        for (i = 0; i < (dstW -7); i+=4) {
            vec_st(vini, i << 2, val);
        }
        for (; i < dstW; i++) {
            val[i] = (1 << 18);
        }

        for (j = 0; j < lumFilterSize; j++) {
            /* j << 1 is the byte offset of lumFilter[j] (2 bytes per int16_t) */
            vector signed short l1, vLumFilter = vec_ld(j << 1, lumFilter);
            vector unsigned char perm, perm0 = vec_lvsl(j << 1, lumFilter);
            vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0);
            vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter

            /* realignment pattern for this input line, reused for every load below */
            perm = vec_lvsl(0, lumSrc[j]);
            l1 = vec_ld(0, lumSrc[j]);

            for (i = 0; i < (dstW - 7); i+=8) {
                int offset = i << 2;
                vector signed short l2 = vec_ld((i << 1) + 16, lumSrc[j]);

                vector signed int v1 = vec_ld(offset, val);
                vector signed int v2 = vec_ld(offset + 16, val);

                vector signed short ls = vec_perm(l1, l2, perm); // lumSrc[j][i] ... lumSrc[j][i+7]

                /* products of the even and odd elements, re-interleaved below */
                vector signed int i1 = vec_mule(vLumFilter, ls);
                vector signed int i2 = vec_mulo(vLumFilter, ls);

                vector signed int vf1 = vec_mergeh(i1, i2);
                vector signed int vf2 = vec_mergel(i1, i2); // lumSrc[j][i] * lumFilter[j] ... lumSrc[j][i+7] * lumFilter[j]

                vector signed int vo1 = vec_add(v1, vf1);
                vector signed int vo2 = vec_add(v2, vf2);

                vec_st(vo1, offset, val);
                vec_st(vo2, offset + 16, val);

                l1 = l2;
            }
            /* scalar tail for the last dstW % 8 pixels */
            for ( ; i < dstW; i++) {
                val[i] += lumSrc[j][i] * lumFilter[j];
            }
        }
        altivec_packIntArrayToCharArray(val, dest, dstW);
    }
    if (uDest != 0) {
        int  __attribute__ ((aligned (16))) u[chrDstW];
        int  __attribute__ ((aligned (16))) v[chrDstW];

        for (i = 0; i < (chrDstW -7); i+=4) {
            vec_st(vini, i << 2, u);
            vec_st(vini, i << 2, v);
        }
        for (; i < chrDstW; i++) {
            u[i] = (1 << 18);
            v[i] = (1 << 18);
        }

        for (j = 0; j < chrFilterSize; j++) {
            vector signed short l1, l1_V, vChrFilter = vec_ld(j << 1, chrFilter);
            vector unsigned char perm, perm0 = vec_lvsl(j << 1, chrFilter);
            vChrFilter = vec_perm(vChrFilter, vChrFilter, perm0);
            vChrFilter = vec_splat(vChrFilter, 0); // chrFilter[j] is loaded 8 times in vChrFilter

            /* The V samples start 2048 elements (4096 bytes, a multiple of 16)
             * after the U samples, so both share the same realignment pattern. */
            perm = vec_lvsl(0, chrSrc[j]);
            l1 = vec_ld(0, chrSrc[j]);
            l1_V = vec_ld(2048 << 1, chrSrc[j]);

            for (i = 0; i < (chrDstW - 7); i+=8) {
                int offset = i << 2;
                vector signed short l2 = vec_ld((i << 1) + 16, chrSrc[j]);
                vector signed short l2_V = vec_ld(((i + 2048) << 1) + 16, chrSrc[j]);

                vector signed int v1 = vec_ld(offset, u);
                vector signed int v2 = vec_ld(offset + 16, u);
                vector signed int v1_V = vec_ld(offset, v);
                vector signed int v2_V = vec_ld(offset + 16, v);

                vector signed short ls = vec_perm(l1, l2, perm); // chrSrc[j][i] ... chrSrc[j][i+7]
                vector signed short ls_V = vec_perm(l1_V, l2_V, perm); // chrSrc[j][i+2048] ... chrSrc[j][i+2055]

                vector signed int i1 = vec_mule(vChrFilter, ls);
                vector signed int i2 = vec_mulo(vChrFilter, ls);
                vector signed int i1_V = vec_mule(vChrFilter, ls_V);
                vector signed int i2_V = vec_mulo(vChrFilter, ls_V);

                vector signed int vf1 = vec_mergeh(i1, i2);
                vector signed int vf2 = vec_mergel(i1, i2); // chrSrc[j][i] * chrFilter[j] ... chrSrc[j][i+7] * chrFilter[j]
                vector signed int vf1_V = vec_mergeh(i1_V, i2_V);
                vector signed int vf2_V = vec_mergel(i1_V, i2_V); // chrSrc[j][i+2048] * chrFilter[j] ... chrSrc[j][i+2055] * chrFilter[j]

                vector signed int vo1 = vec_add(v1, vf1);
                vector signed int vo2 = vec_add(v2, vf2);
                vector signed int vo1_V = vec_add(v1_V, vf1_V);
                vector signed int vo2_V = vec_add(v2_V, vf2_V);

                vec_st(vo1, offset, u);
                vec_st(vo2, offset + 16, u);
                vec_st(vo1_V, offset, v);
                vec_st(vo2_V, offset + 16, v);

                l1 = l2;
                l1_V = l2_V;
            }
            /* scalar tail for the last chrDstW % 8 pixels */
            for ( ; i < chrDstW; i++) {
                u[i] += chrSrc[j][i] * chrFilter[j];
                v[i] += chrSrc[j][i + 2048] * chrFilter[j];
            }
        }
        altivec_packIntArrayToCharArray(u, uDest, chrDstW);
        altivec_packIntArrayToCharArray(v, vDest, chrDstW);
    }
 }

 static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, int16_t *filter, int16_t *filterPos, int filterSize) {
    register int i;
    int __attribute__ ((aligned (16))) tempo[4];

    if (filterSize % 4) {
        for (i=0; i<dstW; i++) {
            register int j;
            register int srcPos = filterPos[i];
            register int val = 0;
            for (j=0; j<filterSize; j++) {
                val += ((int)src[srcPos + j])*filter[filterSize*i + j];
            }
            dst[i] = FFMIN(val>>7, (1<<15)-1);
        }
    }
    else
    switch (filterSize) {
    case 4:
    {
    for (i=0; i<dstW; i++) {
        register int srcPos = filterPos[i];

        vector unsigned char src_v0 = vec_ld(srcPos, src);
        vector unsigned char src_v1, src_vF;
        vector signed short src_v, filter_v;
        vector signed int val_vEven, val_s;
        if ((((int)src + srcPos)% 16) > 12) {
            src_v1 = vec_ld(srcPos + 16, src);
        }
        src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

        src_v = // vec_unpackh sign-extends...
            (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
        // now put our elements in the even slots
        src_v = vec_mergeh(src_v, (vector signed short)vzero);

        filter_v = vec_ld(i << 3, filter);
        // The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).

        // The neat trick: We only care for half the elements,
        // high or low depending on (i<<3)%16 (it's 0 or 8 here),
        // and we're going to use vec_mule, so we choose
        // carefully how to "unpack" the elements into the even slots.
        if ((i << 3) % 16)
            filter_v = vec_mergel(filter_v, (vector signed short)vzero);
        else
            filter_v = vec_mergeh(filter_v, (vector signed short)vzero);

        val_vEven = vec_mule(src_v, filter_v);
        val_s = vec_sums(val_vEven, vzero);
        vec_st(val_s, 0, tempo);
        dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
    }
    }
    break;

    case 8:
    {
    for (i=0; i<dstW; i++) {
        register int srcPos = filterPos[i];

        vector unsigned char src_v0 = vec_ld(srcPos, src);
        vector unsigned char src_v1, src_vF;
        vector signed short src_v, filter_v;
        vector signed int val_v, val_s;
        if ((((int)src + srcPos)% 16) > 8) {
            src_v1 = vec_ld(srcPos + 16, src);
        }
        src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

        src_v = // vec_unpackh sign-extends...
            (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
        filter_v = vec_ld(i << 4, filter);
        // the 4 above is 3 (filterSize == 8) + 1 (sizeof(short) == 2)

        val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
        val_s = vec_sums(val_v, vzero);
        vec_st(val_s, 0, tempo);
        dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
    }
    }
    break;

    case 16:
    {
        for (i=0; i<dstW; i++) {
            register int srcPos = filterPos[i];

            vector unsigned char src_v0 = vec_ld(srcPos, src);
            vector unsigned char src_v1 = vec_ld(srcPos + 16, src);
            vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

            vector signed short src_vA = // vec_unpackh sign-extends...
                (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
            vector signed short src_vB = // vec_unpackh sign-extends...
                (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));

            vector signed short filter_v0 = vec_ld(i << 5, filter);
            vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
            // the 5 above are 4 (filterSize == 16) + 1 (sizeof(short) == 2)

            vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
            vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);

            vector signed int val_s = vec_sums(val_v, vzero);

            vec_st(val_s, 0, tempo);
            dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
        }
    }
    break;

    default:
    {
    for (i=0; i<dstW; i++) {
        register int j;
        register int srcPos = filterPos[i];

        vector signed int val_s, val_v = (vector signed int)vzero;
        vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter);
        vector unsigned char permF = vec_lvsl((i * 2 * filterSize), filter);

        vector unsigned char src_v0 = vec_ld(srcPos, src);
        vector unsigned char permS = vec_lvsl(srcPos, src);

        for (j = 0 ; j < filterSize - 15; j += 16) {
            vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src);
            vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS);

            vector signed short src_vA = // vec_unpackh sign-extends...
                (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
            vector signed short src_vB = // vec_unpackh sign-extends...
                (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));

            vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
            vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter);
            vector signed short filter_v0  = vec_perm(filter_v0R, filter_v1R, permF);
            vector signed short filter_v1  = vec_perm(filter_v1R, filter_v2R, permF);

            vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
            val_v = vec_msums(src_vB, filter_v1, val_acc);

            filter_v0R = filter_v2R;
            src_v0 = src_v1;
        }

        if (j < filterSize-7) {
            // loading src_v0 is useless, it's already done above
            //vector unsigned char src_v0 = vec_ld(srcPos + j, src);
            vector unsigned char src_v1, src_vF;
            vector signed short src_v, filter_v1R, filter_v;
            if ((((int)src + srcPos)% 16) > 8) {
                src_v1 = vec_ld(srcPos + j + 16, src);
            }
            src_vF = vec_perm(src_v0, src_v1, permS);

            src_v = // vec_unpackh sign-extends...
                (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
            // loading filter_v0R is useless, it's already done above
            //vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter);
            filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
            filter_v = vec_perm(filter_v0R, filter_v1R, permF);

            val_v = vec_msums(src_v, filter_v, val_v);
        }

        val_s = vec_sums(val_v, vzero);

        vec_st(val_s, 0, tempo);
        dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
    }

    }
    }
 }

 /**
 * Unscaled conversion of a slice from three separate Y, U and V planes to
 * one packed plane in which every luma byte is followed by a chroma byte
 * (luma first).
 *
 * The U and V pointers advance once every two luma rows
 * (vertLumPerChroma == 2). When the width is not a multiple of 16 the work
 * is delegated to yv12toyuy2().
 *
 * @param c           context, only srcW is read
 * @param src         pointers to the Y, U and V planes of the slice
 * @param srcStride   strides of the source planes; [0] is used for Y and
 *                    [1] for both U and V
 * @param srcSliceY   first row of the slice, selects the first output row
 * @param srcSliceH   number of rows in the slice
 * @param dstParam    destination planes, only [0] is written
 * @param dstStride_a destination strides, only [0] is used
 * @return srcSliceH
 */
 static inline int yv12toyuy2_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                              int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) {
    uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY;
    // equivalent C call: yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]);
    uint8_t *ysrc = src[0];
    uint8_t *usrc = src[1];
    uint8_t *vsrc = src[2];
    const int width = c->srcW;
    const int height = srcSliceH;
    const int lumStride = srcStride[0];
    const int chromStride = srcStride[1];
    const int dstStride = dstStride_a[0];
    // realignment pattern computed once from the first luma row
    const vector unsigned char yperm = vec_lvsl(0, ysrc);
    const int vertLumPerChroma = 2;
    register unsigned int y;

    if (width&15) {
        yv12toyuy2(ysrc, usrc, vsrc, dst, c->srcW, srcSliceH, lumStride, chromStride, dstStride);
        return srcSliceH;
    }

    /* This code assumes:

    1) dst is 16 bytes-aligned
    2) dstStride is a multiple of 16
    3) width is a multiple of 16
    4) lum & chrom stride are multiples of 8
    */

    for (y=0; y<height; y++) {
        int i;
        /* 32 luma + 16 U + 16 V bytes in, 64 packed bytes out per iteration */
        for (i = 0; i < width - 31; i+= 32) {
            const unsigned int j = i >> 1;
            vector unsigned char v_yA = vec_ld(i, ysrc);
            vector unsigned char v_yB = vec_ld(i + 16, ysrc);
            vector unsigned char v_yC = vec_ld(i + 32, ysrc);
            vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
            vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
            vector unsigned char v_uA = vec_ld(j, usrc);
            vector unsigned char v_uB = vec_ld(j + 16, usrc);
            vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
            vector unsigned char v_vA = vec_ld(j, vsrc);
            vector unsigned char v_vB = vec_ld(j + 16, vsrc);
            vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
            /* interleave U with V, then luma with the U/V pairs */
            vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
            vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
            vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
            vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
            vector unsigned char v_yuy2_2 = vec_mergeh(v_y2, v_uv_b);
            vector unsigned char v_yuy2_3 = vec_mergel(v_y2, v_uv_b);
            vec_st(v_yuy2_0, (i << 1), dst);
            vec_st(v_yuy2_1, (i << 1) + 16, dst);
            vec_st(v_yuy2_2, (i << 1) + 32, dst);
            vec_st(v_yuy2_3, (i << 1) + 48, dst);
        }
        /* remaining 16 pixels (width is a multiple of 16 here).
         * NOTE(review): unlike the loop above, these loads are not realigned
         * with vec_perm -- presumably covered by the assumptions listed
         * before the loop; confirm. */
        if (i < width) {
            const unsigned int j = i >> 1;
            vector unsigned char v_y1 = vec_ld(i, ysrc);
            vector unsigned char v_u = vec_ld(j, usrc);
            vector unsigned char v_v = vec_ld(j, vsrc);
            vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
            vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
            vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
            vec_st(v_yuy2_0, (i << 1), dst);
            vec_st(v_yuy2_1, (i << 1) + 16, dst);
        }
        /* move to the next chroma row after every second luma row */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }

    return srcSliceH;
 }

 /**
 * Unscaled conversion of a slice from three separate Y, U and V planes to
 * one packed plane in which every chroma byte is followed by a luma byte
 * (chroma first).
 *
 * The U and V pointers advance once every two luma rows
 * (vertLumPerChroma == 2). When the width is not a multiple of 16 the work
 * is delegated to yv12touyvy().
 *
 * @param c           context, only srcW is read
 * @param src         pointers to the Y, U and V planes of the slice
 * @param srcStride   strides of the source planes; [0] is used for Y and
 *                    [1] for both U and V
 * @param srcSliceY   first row of the slice, selects the first output row
 * @param srcSliceH   number of rows in the slice
 * @param dstParam    destination planes, only [0] is written
 * @param dstStride_a destination strides, only [0] is used
 * @return srcSliceH
 */
 static inline int yv12touyvy_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                              int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) {
    uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY;
    // equivalent C call: yv12touyvy(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]);
    uint8_t *ysrc = src[0];
    uint8_t *usrc = src[1];
    uint8_t *vsrc = src[2];
    const int width = c->srcW;
    const int height = srcSliceH;
    const int lumStride = srcStride[0];
    const int chromStride = srcStride[1];
    const int dstStride = dstStride_a[0];
    const int vertLumPerChroma = 2;
    // realignment pattern computed once from the first luma row
    const vector unsigned char yperm = vec_lvsl(0, ysrc);
    register unsigned int y;

    if (width&15) {
        yv12touyvy(ysrc, usrc, vsrc, dst, c->srcW, srcSliceH, lumStride, chromStride, dstStride);
        return srcSliceH;
    }

    /* This code assumes:

    1) dst is 16 bytes-aligned
    2) dstStride is a multiple of 16
    3) width is a multiple of 16
    4) lum & chrom stride are multiples of 8
    */

    for (y=0; y<height; y++) {
        int i;
        /* 32 luma + 16 U + 16 V bytes in, 64 packed bytes out per iteration */
        for (i = 0; i < width - 31; i+= 32) {
            const unsigned int j = i >> 1;
            vector unsigned char v_yA = vec_ld(i, ysrc);
            vector unsigned char v_yB = vec_ld(i + 16, ysrc);
            vector unsigned char v_yC = vec_ld(i + 32, ysrc);
            vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
            vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
            vector unsigned char v_uA = vec_ld(j, usrc);
            vector unsigned char v_uB = vec_ld(j + 16, usrc);
            vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
            vector unsigned char v_vA = vec_ld(j, vsrc);
            vector unsigned char v_vB = vec_ld(j + 16, vsrc);
            vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
            /* interleave U with V, then the U/V pairs with luma */
            vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
            vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
            vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
            vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
            vector unsigned char v_uyvy_2 = vec_mergeh(v_uv_b, v_y2);
            vector unsigned char v_uyvy_3 = vec_mergel(v_uv_b, v_y2);
            vec_st(v_uyvy_0, (i << 1), dst);
            vec_st(v_uyvy_1, (i << 1) + 16, dst);
            vec_st(v_uyvy_2, (i << 1) + 32, dst);
            vec_st(v_uyvy_3, (i << 1) + 48, dst);
        }
        /* remaining 16 pixels (width is a multiple of 16 here).
         * NOTE(review): unlike the loop above, these loads are not realigned
         * with vec_perm -- presumably covered by the assumptions listed
         * before the loop; confirm. */
        if (i < width) {
            const unsigned int j = i >> 1;
            vector unsigned char v_y1 = vec_ld(i, ysrc);
            vector unsigned char v_u = vec_ld(j, usrc);
            vector unsigned char v_v = vec_ld(j, vsrc);
            vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
            vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
            vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
            vec_st(v_uyvy_0, (i << 1), dst);
            vec_st(v_uyvy_1, (i << 1) + 16, dst);
        }
        /* move to the next chroma row after every second luma row */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    return srcSliceH;
 }
--- a/libswscale/swscale_avoption.c
+++ b/libswscale/swscale_avoption.c
@@ -0,0 +1,60 @@
 /*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavutil/avutil.h"
 #include "libavcodec/opt.h"
 #include "swscale.h"
 #include "swscale_internal.h"

 /**
  * Name callback referenced by sws_context_class.
  *
  * @param ptr context pointer; not inspected
  * @return the constant string "swscaler"
  */
 static const char * sws_context_to_name(void * ptr)
 {
    static const char *const label = "swscaler";

    (void)ptr;
    return label;
 }

 /* Byte offset of member x inside SwsContext, used by the option table. */
 #define OFFSET(x) offsetof(SwsContext, x)
 /* Default value given to the "sws_flags" option entry. */
 #define DEFAULT 0
 /* Flag set shared by every option entry. Parenthesized so the OR remains a
  * single operand wherever the macro is expanded. */
 #define VE (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM)

 /*
  * Option table referenced by sws_context_class.
  * The first entry is the "sws_flags" option of type FF_OPT_TYPE_FLAGS, located
  * at SwsContext.flags via OFFSET(flags). Every following entry is an
  * FF_OPT_TYPE_CONST that gives a name to one SWS_* constant and carries the
  * same "sws_flags" unit string. The list ends with a { NULL } sentinel.
  */
 static const AVOption options[] = {
    { "sws_flags", "scaler/cpu flags", OFFSET(flags), FF_OPT_TYPE_FLAGS, DEFAULT, 0, UINT_MAX, VE, "sws_flags" },
    { "fast_bilinear", "fast bilinear", 0, FF_OPT_TYPE_CONST, SWS_FAST_BILINEAR, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "bilinear", "bilinear", 0, FF_OPT_TYPE_CONST, SWS_BILINEAR, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "bicubic", "bicubic", 0, FF_OPT_TYPE_CONST, SWS_BICUBIC, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "experimental", "experimental", 0, FF_OPT_TYPE_CONST, SWS_X, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "neighbor", "nearest neighbor", 0, FF_OPT_TYPE_CONST, SWS_POINT, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "area", "averaging area", 0, FF_OPT_TYPE_CONST, SWS_AREA, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "bicublin", "luma bicubic, chroma bilinear", 0, FF_OPT_TYPE_CONST, SWS_BICUBLIN, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "gauss", "gaussian", 0, FF_OPT_TYPE_CONST, SWS_GAUSS, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "sinc", "sinc", 0, FF_OPT_TYPE_CONST, SWS_SINC, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "lanczos", "lanczos", 0, FF_OPT_TYPE_CONST, SWS_LANCZOS, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "spline", "natural bicubic spline", 0, FF_OPT_TYPE_CONST, SWS_SPLINE, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "print_info", "print info", 0, FF_OPT_TYPE_CONST, SWS_PRINT_INFO, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "accurate_rnd", "accurate rounding", 0, FF_OPT_TYPE_CONST, SWS_ACCURATE_RND, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "mmx", "MMX SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_MMX, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "mmx2", "MMX2 SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_MMX2, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "3dnow", "3DNOW SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_3DNOW, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "altivec", "AltiVec SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_ALTIVEC, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "bfin", "Blackfin SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_BFIN, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "full_chroma_int", "full chroma interpolation", 0 , FF_OPT_TYPE_CONST, SWS_FULL_CHR_H_INT, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "full_chroma_inp", "full chroma input", 0 , FF_OPT_TYPE_CONST, SWS_FULL_CHR_H_INP, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "bitexact", "", 0 , FF_OPT_TYPE_CONST, SWS_BITEXACT, INT_MIN, INT_MAX, VE, "sws_flags" },
    { NULL }
 };

 /* AVClass instance for SwsContext: bundles the "SWScaler" string,
  * sws_context_to_name and the options table. */
 const AVClass sws_context_class = { "SWScaler", sws_context_to_name, options };
--- a/libswscale/swscale_bfin.c
+++ b/libswscale/swscale_bfin.c
@@ -0,0 +1,91 @@
 /*
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * Blackfin software video scaler operations
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <inttypes.h>
 #include <assert.h>
 #include "config.h"
 #include <unistd.h>
 #include "rgb2rgb.h"
 #include "swscale.h"
 #include "swscale_internal.h"

 /* L1CODE tags a function with the l1_text attribute on __FDPIC__ builds and
  * expands to nothing elsewhere. */
 #ifdef __FDPIC__
 #define L1CODE __attribute__ ((l1_text))
 #else
 #define L1CODE
 #endif

 /* Packed-to-planar converters defined outside this file. The *_unscaled
  * wrappers in this file call them with: packed source, Y/U/V destinations,
  * width, height, luma stride, chroma stride, source stride. */
 int ff_bfin_uyvytoyv12 (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                        long width, long height,
                        long lumStride, long chromStride, long srcStride) L1CODE;

 int ff_bfin_yuyvtoyv12 (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                        long width, long height,
                        long lumStride, long chromStride, long srcStride) L1CODE;

 /**
  * SwsFunc wrapper that hands one slice of packed UYVY input to
  * ff_bfin_uyvytoyv12().
  *
  * The luma destination starts at row srcSliceY; both chroma destinations
  * start at byte offset stride*srcSliceY/2 in their planes.
  * NOTE(review): the width argument passed on is dstStride[0] (the luma
  * stride), not a picture width -- confirm this is intended.
  *
  * @return srcSliceH
  */
 static int uyvytoyv12_unscaled (SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                 int srcSliceH, uint8_t* dst[], int dstStride[])
 {
    const int lumaStride   = dstStride[0];
    const int chromaStride = dstStride[1];
    const int packedStride = srcStride[0];
    const int chromaOffU   = (dstStride[1]*srcSliceY)/2;
    const int chromaOffV   = (dstStride[2]*srcSliceY)/2;

    ff_bfin_uyvytoyv12 (src[0] + packedStride*srcSliceY,
                        dst[0] + lumaStride*srcSliceY,
                        dst[1] + chromaOffU,
                        dst[2] + chromaOffV,
                        lumaStride, srcSliceH,
                        lumaStride, chromaStride, packedStride);

    return srcSliceH;
 }

 /**
  * SwsFunc wrapper that hands one slice of packed YUYV input to
  * ff_bfin_yuyvtoyv12().
  *
  * The luma destination starts at row srcSliceY; both chroma destinations
  * start at byte offset stride*srcSliceY/2 in their planes.
  * NOTE(review): the width argument passed on is dstStride[0] (the luma
  * stride), not a picture width -- confirm this is intended.
  *
  * @return srcSliceH
  */
 static int yuyvtoyv12_unscaled (SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                 int srcSliceH, uint8_t* dst[], int dstStride[])
 {
    const int lumaStride   = dstStride[0];
    const int chromaStride = dstStride[1];
    const int packedStride = srcStride[0];
    const int chromaOffU   = (dstStride[1]*srcSliceY)/2;
    const int chromaOffV   = (dstStride[2]*srcSliceY)/2;

    ff_bfin_yuyvtoyv12 (src[0] + packedStride*srcSliceY,
                        dst[0] + lumaStride*srcSliceY,
                        dst[1] + chromaOffU,
                        dst[2] + chromaOffV,
                        lumaStride, srcSliceH,
                        lumaStride, chromaStride, packedStride);

    return srcSliceH;
 }


 /**
  * Install a Blackfin-optimized unscaled converter into the context.
  *
  * c->swScale is replaced only when SWS_CPU_CAPS_BFIN is set in c->flags, the
  * destination format is PIX_FMT_YUV420P and the source format is
  * PIX_FMT_UYVY422 or PIX_FMT_YUYV422. In every other case the context is
  * left untouched.
  *
  * @param c scaler context to inspect and possibly update
  */
 void ff_bfin_get_unscaled_swscale (SwsContext *c)
 {
    /* Both converters require the Blackfin capability flag; checking it up
     * front keeps the YUYV case from being selected without it. */
    if (!(c->flags & SWS_CPU_CAPS_BFIN))
        return;
    if (c->dstFormat != PIX_FMT_YUV420P)
        return;

    if (c->srcFormat == PIX_FMT_UYVY422) {
        av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized uyvytoyv12_unscaled\n");
        c->swScale = uyvytoyv12_unscaled;
    } else if (c->srcFormat == PIX_FMT_YUYV422) {
        av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
        c->swScale = yuyvtoyv12_unscaled;
    }
 }
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -0,0 +1,324 @@
 /*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #ifndef SWSCALE_SWSCALE_INTERNAL_H
 #define SWSCALE_SWSCALE_INTERNAL_H

 #include "config.h"

 #if HAVE_ALTIVEC_H
 #include <altivec.h>
 #endif

 #include "libavutil/avutil.h"

 /* Shorthand for AV_TOSTRING(). */
 #define STR(s)         AV_TOSTRING(s) //AV_STRINGIFY is too long

 /* SwsContext.lumMmxFilter and chrMmxFilter each hold 4*MAX_FILTER_SIZE
  * int32_t entries. */
 #define MAX_FILTER_SIZE 256

 /* VOF is the byte size of SwsContext.formatConvBuffer; VOFW is half of it. */
 #define VOFW 2048
 #define VOF  (VOFW*2)

 /* -1 when WORDS_BIGENDIAN is defined, +1 otherwise.
  * NOTE(review): the users of this correction are outside this header. */
 #ifdef WORDS_BIGENDIAN
 #define ALT32_CORR (-1)
 #else
 #define ALT32_CORR   1
 #endif

 /* Three layout constants whose values double (roughly) on x86-64.
  * NOTE(review): presumably pointer-size dependent offsets into a packed
  * filter record consumed by asm -- confirm against the users. */
 #if ARCH_X86_64
 #   define APCK_PTR2 8
 #   define APCK_COEF 16
 #   define APCK_SIZE 24
 #else
 #   define APCK_PTR2 4
 #   define APCK_COEF 8
 #   define APCK_SIZE 16
 #endif

 struct SwsContext;

 /**
  * Signature of a slice conversion routine, as stored in SwsContext.swScale.
  * Arguments are the context, source plane pointers and strides, the first
  * source row and row count of the slice, and destination plane pointers and
  * strides.
  */
 typedef int (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]);

 /* This struct should be aligned on at least a 32-byte boundary. */
 typedef struct SwsContext{
    /**
     * info on struct for av_log
     */
    const AVClass *av_class;

    /**
     * Note that src, dst, srcStride, dstStride will be copied in the
     * sws_scale() wrapper so they can be freely modified here.
     */
    SwsFunc swScale;
    int srcW, srcH, dstH;
    int chrSrcW, chrSrcH, chrDstW, chrDstH;
    int lumXInc, chrXInc;
    int lumYInc, chrYInc;
    enum PixelFormat dstFormat, srcFormat;  ///< format 4:2:0 type is always YV12
    int origDstFormat, origSrcFormat;       ///< format
    int chrSrcHSubSample, chrSrcVSubSample;
    int chrIntHSubSample, chrIntVSubSample;
    int chrDstHSubSample, chrDstVSubSample;
    int vChrDrop;
    int sliceDir;
    double param[2];

    uint32_t pal_yuv[256];
    uint32_t pal_rgb[256];

    int16_t **lumPixBuf;
    int16_t **chrPixBuf;
    int16_t *hLumFilter;
    int16_t *hLumFilterPos;
    int16_t *hChrFilter;
    int16_t *hChrFilterPos;
    int16_t *vLumFilter;
    int16_t *vLumFilterPos;
    int16_t *vChrFilter;
    int16_t *vChrFilterPos;

    uint8_t formatConvBuffer[VOF]; //FIXME dynamic allocation, but we have to change a lot of code for this to be useful

    int hLumFilterSize;
    int hChrFilterSize;
    int vLumFilterSize;
    int vChrFilterSize;
    int vLumBufSize;
    int vChrBufSize;

    uint8_t *funnyYCode;
    uint8_t *funnyUVCode;
    int32_t *lumMmx2FilterPos;
    int32_t *chrMmx2FilterPos;
    int16_t *lumMmx2Filter;
    int16_t *chrMmx2Filter;

    int canMMX2BeUsed;

    int lastInLumBuf;
    int lastInChrBuf;
    int lumBufIndex;
    int chrBufIndex;
    int dstY;
    int flags;                  // target of the "sws_flags" option (SWS_* values)
    void * yuvTable;            // pointer to the yuv->rgb table start so it can be freed()
    // Per-chroma-value lookup entries used by the C yuv2rgb converters:
    // table_rV/table_bU/table_gU hold pointers, table_gV holds an offset that
    // is added to the table_gU pointer.
    uint8_t * table_rV[256];
    uint8_t * table_gU[256];
    int    table_gV[256];
    uint8_t * table_bU[256];

    //Colorspace stuff
    int contrast, brightness, saturation;    // for sws_getColorspaceDetails
    int srcColorspaceTable[4];
    int dstColorspaceTable[4];
    int srcRange, dstRange;
    int yuv2rgb_y_offset;
    int yuv2rgb_y_coeff;
    int yuv2rgb_v2r_coeff;
    int yuv2rgb_v2g_coeff;
    int yuv2rgb_u2g_coeff;
    int yuv2rgb_u2b_coeff;

    /*
     * String-valued byte offsets of the fields that follow, counted from
     * redDither: eleven 8-byte values, then the two 4*MAX_FILTER_SIZE int32_t
     * filter arrays, then dstW and the remaining 8-byte-aligned fields.
     * The macros and the field order below must stay in sync.
     */
 #define RED_DITHER            "0*8"
 #define GREEN_DITHER          "1*8"
 #define BLUE_DITHER           "2*8"
 #define Y_COEFF               "3*8"
 #define VR_COEFF              "4*8"
 #define UB_COEFF              "5*8"
 #define VG_COEFF              "6*8"
 #define UG_COEFF              "7*8"
 #define Y_OFFSET              "8*8"
 #define U_OFFSET              "9*8"
 #define V_OFFSET              "10*8"
 #define LUM_MMX_FILTER_OFFSET "11*8"
 #define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
 #define DSTW_OFFSET           "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM
 #define ESP_OFFSET            "11*8+4*4*256*2+8"
 #define VROUNDER_OFFSET       "11*8+4*4*256*2+16"
 #define U_TEMP                "11*8+4*4*256*2+24"
 #define V_TEMP                "11*8+4*4*256*2+32"

    uint64_t redDither   __attribute__((aligned(8)));
    uint64_t greenDither __attribute__((aligned(8)));
    uint64_t blueDither  __attribute__((aligned(8)));

    uint64_t yCoeff      __attribute__((aligned(8)));
    uint64_t vrCoeff     __attribute__((aligned(8)));
    uint64_t ubCoeff     __attribute__((aligned(8)));
    uint64_t vgCoeff     __attribute__((aligned(8)));
    uint64_t ugCoeff     __attribute__((aligned(8)));
    uint64_t yOffset     __attribute__((aligned(8)));
    uint64_t uOffset     __attribute__((aligned(8)));
    uint64_t vOffset     __attribute__((aligned(8)));
    int32_t  lumMmxFilter[4*MAX_FILTER_SIZE];
    int32_t  chrMmxFilter[4*MAX_FILTER_SIZE];
    int dstW;
    uint64_t esp          __attribute__((aligned(8)));
    uint64_t vRounder     __attribute__((aligned(8)));
    uint64_t u_temp       __attribute__((aligned(8)));
    uint64_t v_temp       __attribute__((aligned(8)));

    /* Vector fields present only when HAVE_ALTIVEC is set. */
 #if HAVE_ALTIVEC

  vector signed short   CY;
  vector signed short   CRV;
  vector signed short   CBU;
  vector signed short   CGU;
  vector signed short   CGV;
  vector signed short   OY;
  vector unsigned short CSHIFT;
  vector signed short   *vYCoeffsBank, *vCCoeffsBank;

 #endif


    /* 4-byte-aligned words present only when ARCH_BFIN is set. */
 #if ARCH_BFIN
    uint32_t oy           __attribute__((aligned(4)));
    uint32_t oc           __attribute__((aligned(4)));
    uint32_t zero         __attribute__((aligned(4)));
    uint32_t cy           __attribute__((aligned(4)));
    uint32_t crv          __attribute__((aligned(4)));
    uint32_t rmask        __attribute__((aligned(4)));
    uint32_t cbu          __attribute__((aligned(4)));
    uint32_t bmask        __attribute__((aligned(4)));
    uint32_t cgu          __attribute__((aligned(4)));
    uint32_t cgv          __attribute__((aligned(4)));
    uint32_t gmask        __attribute__((aligned(4)));
 #endif

    /* Coefficient storage present only when HAVE_VIS is set. */
 #if HAVE_VIS
    uint64_t sparc_coeffs[10] __attribute__((aligned(8)));
 #endif

 } SwsContext;
 //FIXME check init (where 0)

 /* Returns the YUV -> RGB conversion routine to use for the context. */
 SwsFunc sws_yuv2rgb_get_func_ptr (SwsContext *c);
 /* Builds the lookup tables used by the C YUV -> RGB converters from a
  * 4-entry coefficient table and the range/brightness/contrast/saturation
  * settings. */
 int sws_yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);

 /* AltiVec counterparts; defined outside this header. */
 void sws_yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation);
 SwsFunc sws_yuv2rgb_init_altivec (SwsContext *c);
 void altivec_yuv2packedX (SwsContext *c,
                          int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                          int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                          uint8_t *dest, int dstW, int dstY);

 /* Returns a printable name for a pixel format value. */
 const char *sws_format_name(int format);

 /*
  * Pixel format classification macros. Each expands to a chain of equality
  * tests, so the argument is evaluated several times: pass only
  * side-effect-free expressions.
  */
 //FIXME replace this with something faster
 /* Planar YUV layouts, including the NV12/NV21 formats. */
 #define isPlanarYUV(x)  (           \
           (x)==PIX_FMT_YUV410P     \
        || (x)==PIX_FMT_YUV420P     \
        || (x)==PIX_FMT_YUV411P     \
        || (x)==PIX_FMT_YUV422P     \
        || (x)==PIX_FMT_YUV444P     \
        || (x)==PIX_FMT_YUV440P     \
        || (x)==PIX_FMT_NV12        \
        || (x)==PIX_FMT_NV21        \
    )
 /* Packed UYVY422/YUYV422 or anything accepted by isPlanarYUV(). */
 #define isYUV(x)        (           \
           (x)==PIX_FMT_UYVY422     \
        || (x)==PIX_FMT_YUYV422     \
        || isPlanarYUV(x)           \
    )
 /* 8-bit or 16-bit (either endianness) gray. */
 #define isGray(x)       (           \
           (x)==PIX_FMT_GRAY8       \
        || (x)==PIX_FMT_GRAY16BE    \
        || (x)==PIX_FMT_GRAY16LE    \
    )
 /* 16-bit gray only. */
 #define isGray16(x)     (           \
           (x)==PIX_FMT_GRAY16BE    \
        || (x)==PIX_FMT_GRAY16LE    \
    )
 /* RGB-ordered formats; MONOBLACK and MONOWHITE are accepted here as well as
  * by isBGR(). */
 #define isRGB(x)        (           \
           (x)==PIX_FMT_RGB32       \
        || (x)==PIX_FMT_RGB32_1     \
        || (x)==PIX_FMT_RGB24       \
        || (x)==PIX_FMT_RGB565      \
        || (x)==PIX_FMT_RGB555      \
        || (x)==PIX_FMT_RGB8        \
        || (x)==PIX_FMT_RGB4        \
        || (x)==PIX_FMT_RGB4_BYTE   \
        || (x)==PIX_FMT_MONOBLACK   \
        || (x)==PIX_FMT_MONOWHITE   \
    )
 /* BGR-ordered formats; MONOBLACK and MONOWHITE are accepted here as well as
  * by isRGB(). */
 #define isBGR(x)        (           \
           (x)==PIX_FMT_BGR32       \
        || (x)==PIX_FMT_BGR32_1     \
        || (x)==PIX_FMT_BGR24       \
        || (x)==PIX_FMT_BGR565      \
        || (x)==PIX_FMT_BGR555      \
        || (x)==PIX_FMT_BGR8        \
        || (x)==PIX_FMT_BGR4        \
        || (x)==PIX_FMT_BGR4_BYTE   \
        || (x)==PIX_FMT_MONOBLACK   \
        || (x)==PIX_FMT_MONOWHITE   \
    )
 /* The four 32-bit RGB/BGR formats plus YUVA420P. */
 #define isALPHA(x)      (           \
           (x)==PIX_FMT_BGR32       \
        || (x)==PIX_FMT_BGR32_1     \
        || (x)==PIX_FMT_RGB32       \
        || (x)==PIX_FMT_RGB32_1     \
        || (x)==PIX_FMT_YUVA420P    \
    )

 /**
  * Look up the colour depth, in bits, of an RGB/BGR, 16-bit gray or mono
  * pixel format.
  *
  * @param fmt pixel format value, compared against PIX_FMT_* constants
  * @return 32, 24, 16, 15, 8, 4 or 1 for the formats listed in the switch,
  *         0 for every other value
  */
 static inline int fmt_depth(int fmt)
 {
    int depth = 0;

    switch(fmt) {
        case PIX_FMT_MONOBLACK:
        case PIX_FMT_MONOWHITE:
            depth = 1;
            break;
        case PIX_FMT_BGR4:
        case PIX_FMT_RGB4:
        case PIX_FMT_BGR4_BYTE:
        case PIX_FMT_RGB4_BYTE:
            depth = 4;
            break;
        case PIX_FMT_BGR8:
        case PIX_FMT_RGB8:
            depth = 8;
            break;
        case PIX_FMT_BGR555:
        case PIX_FMT_RGB555:
            depth = 15;
            break;
        case PIX_FMT_BGR565:
        case PIX_FMT_RGB565:
        case PIX_FMT_GRAY16BE:
        case PIX_FMT_GRAY16LE:
            depth = 16;
            break;
        case PIX_FMT_BGR24:
        case PIX_FMT_RGB24:
            depth = 24;
            break;
        case PIX_FMT_BGRA:
        case PIX_FMT_ABGR:
        case PIX_FMT_RGBA:
        case PIX_FMT_ARGB:
            depth = 32;
            break;
        default:
            break;
    }
    return depth;
 }

 /* Two-entry 64-bit dither constants; defined outside this header. */
 extern const uint64_t ff_dither4[2];
 extern const uint64_t ff_dither8[2];

 /* AVClass instance for SwsContext, defined in swscale_avoption.c. */
 extern const AVClass sws_context_class;

 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
--- a/libswscale/swscale_template.c
+++ b/libswscale/swscale_template.c
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -0,0 +1,684 @@
 /*
 * software YUV to RGB converter
 *
 * Copyright (C) 2009 Konstantin Shishkov
 *
 * MMX/MMX2 template stuff (needed for fast movntq support),
 * 1,4,8bpp support and context / deglobalize stuff
 * by Michael Niedermayer (michaelni@gmx.at)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdio.h>
 #include <stdlib.h>
 #include <inttypes.h>
 #include <assert.h>

 #include "config.h"
 #include "rgb2rgb.h"
 #include "swscale.h"
 #include "swscale_internal.h"

 #define DITHER1XBPP // only for MMX

 /* 8x8 ordered-dither matrices defined outside this file; the *_ordered_dither
  * converters in this file index them by (y & 7). */
 extern const uint8_t dither_8x8_32[8][8];
 extern const uint8_t dither_8x8_73[8][8];
 extern const uint8_t dither_8x8_220[8][8];

 /*
  * Instantiate yuv2rgb_template.c twice when MMX and GPL code are enabled:
  * first with HAVE_MMX2 and HAVE_AMD3DNOW forced to 0 and names suffixed _MMX,
  * then with HAVE_MMX2 forced to 1 and names suffixed _MMX2.
  * After this block HAVE_MMX2 stays defined as 1 for the rest of the file.
  */
 #if HAVE_MMX && CONFIG_GPL

 /* hope these constant values are cache line aligned */
 DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw)   = 0x00ff00ff00ff00ffULL;
 DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
 DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;

 //MMX versions
 #undef RENAME
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 0
 #define RENAME(a) a ## _MMX
 #include "yuv2rgb_template.c"

 //MMX2 versions
 #undef RENAME
 #undef HAVE_MMX2
 #define HAVE_MMX2 1
 #define RENAME(a) a ## _MMX2
 #include "yuv2rgb_template.c"

 #endif /* HAVE_MMX && CONFIG_GPL */

 /*
  * Eight rows of four YUV -> RGB coefficients, one row per colorimetry named
  * in the row comment.
  * NOTE(review): presumably a row is what callers pass as inv_table to
  * sws_yuv2rgb_c_init_tables(), which reads entries 0..3 as crv, cbu, -cgu and
  * -cgv -- confirm against the callers.
  */
 const int32_t ff_yuv2rgb_coeffs[8][4] = {
    {117504, 138453, 13954, 34903}, /* no sequence_display_extension */
    {117504, 138453, 13954, 34903}, /* ITU-R Rec. 709 (1990) */
    {104597, 132201, 25675, 53279}, /* unspecified */
    {104597, 132201, 25675, 53279}, /* reserved */
    {104448, 132798, 24759, 53109}, /* FCC */
    {104597, 132201, 25675, 53279}, /* ITU-R Rec. 624-4 System B, G */
    {104597, 132201, 25675, 53279}, /* SMPTE 170M */
    {117579, 136230, 16907, 35559}  /* SMPTE 240M (1987) */
 };

 /*
  * Building blocks for the bodies of the YUV2RGBFUNC() converters. They rely
  * on the locals that YUV2RGBFUNC declares (c, pu, pv, U, V, Y, r, g, b) and
  * expand to several statements, so they must not be used as the unbraced
  * body of an if/else.
  */

 /* Read chroma sample i and point r, g, b at the lookup tables selected by
  * it; g is the table_gU pointer advanced by the table_gV offset. */
 #define LOADCHROMA(i)                               \
    U = pu[i];                                      \
    V = pv[i];                                      \
    r = (void *)c->table_rV[V];                     \
    g = (void *)(c->table_gU[U] + c->table_gV[V]);  \
    b = (void *)c->table_bU[U];

 /* Write output pixels 2*i and 2*i+1 as the sum of the r, g and b table
  * entries. With o == 0 luma samples are read in order; with o == 1 sample
  * 2*i+1 is read first, but each still goes to its own output position. */
 #define PUTRGB(dst,src,i,o)          \
    Y = src[2*i+o];                  \
    dst[2*i  ] = r[Y] + g[Y] + b[Y]; \
    Y = src[2*i+1-o];                \
    dst[2*i+1] = r[Y] + g[Y] + b[Y];

 /* Write pixels 2*i and 2*i+1 as three separate bytes each, in r, g, b order. */
 #define PUTRGB24(dst,src,i)                                  \
    Y = src[2*i];                                            \
    dst[6*i+0] = r[Y]; dst[6*i+1] = g[Y]; dst[6*i+2] = b[Y]; \
    Y = src[2*i+1];                                          \
    dst[6*i+3] = r[Y]; dst[6*i+4] = g[Y]; dst[6*i+5] = b[Y];

 /* Same as PUTRGB24 with the byte order b, g, r. */
 #define PUTBGR24(dst,src,i)                                  \
    Y = src[2*i];                                            \
    dst[6*i+0] = b[Y]; dst[6*i+1] = g[Y]; dst[6*i+2] = r[Y]; \
    Y = src[2*i+1];                                          \
    dst[6*i+3] = b[Y]; dst[6*i+4] = g[Y]; dst[6*i+5] = r[Y];

 /*
  * Converter skeleton, split over several macros.
  *
  * YUV2RGBFUNC opens a static function with the SwsFunc signature. For
  * PIX_FMT_YUV422P input it doubles both chroma strides in place. It then
  * walks the slice two rows at a time, setting up two destination row
  * pointers (dst_1, dst_2), two luma row pointers (py_1, py_2) and the chroma
  * row pointers for row y>>1, and opens a loop over dstW>>3 groups of 8
  * pixels. The text following the macro is the body for one such group.
  */
 #define YUV2RGBFUNC(func_name, dst_type) \
 static int func_name(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, \
                     int srcSliceH, uint8_t* dst[], int dstStride[]){\
    int y;\
 \
    if (c->srcFormat == PIX_FMT_YUV422P) {\
        srcStride[1] *= 2;\
        srcStride[2] *= 2;\
    }\
    for (y=0; y<srcSliceH; y+=2) {\
        dst_type *dst_1 = (dst_type*)(dst[0] + (y+srcSliceY  )*dstStride[0]);\
        dst_type *dst_2 = (dst_type*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);\
        dst_type av_unused *r, *b;\
        dst_type *g;\
        uint8_t *py_1 = src[0] + y*srcStride[0];\
        uint8_t *py_2 = py_1 + srcStride[0];\
        uint8_t *pu = src[1] + (y>>1)*srcStride[1];\
        uint8_t *pv = src[2] + (y>>1)*srcStride[2];\
        unsigned int h_size = c->dstW>>3;\
        while (h_size--) {\
            int av_unused U, V;\
            int Y;\

 /*
  * Ends the 8-pixel group: advances the chroma pointers by 4, the luma
  * pointers by 8 and both destination pointers by dst_delta elements, closes
  * the group loop and opens the tail block that runs when dstW has bit 2 set
  * (4 leftover pixels). The text following the macro is the tail body.
  */
 #define ENDYUV2RGBLINE(dst_delta)\
            pu += 4;\
            pv += 4;\
            py_1 += 8;\
            py_2 += 8;\
            dst_1 += dst_delta;\
            dst_2 += dst_delta;\
        }\
        if (c->dstW & 4) {\
            int av_unused Y, U, V;\

 /* Closes the tail block, the row loop and the function, returning srcSliceH. */
 #define ENDYUV2RGBFUNC()\
        }\
    }\
    return srcSliceH;\
 }

 /* Ends the group and the function at once, leaving the tail block empty. */
 #define CLOSEYUV2RGBFUNC(dst_delta)\
    ENDYUV2RGBLINE(dst_delta)\
    ENDYUV2RGBFUNC()

 /*
  * Table-driven converter writing one uint32_t per pixel.
  * The first body handles a group of 8 pixels on two rows (four chroma
  * samples); the body after ENDYUV2RGBLINE handles a 4-pixel tail (two chroma
  * samples). Each chroma sample is loaded once and shared by a 2x2 block.
  */
 YUV2RGBFUNC(yuv2rgb_c_32, uint32_t)
    LOADCHROMA(0);
    PUTRGB(dst_1,py_1,0,0);
    PUTRGB(dst_2,py_2,0,1);

    LOADCHROMA(1);
    PUTRGB(dst_2,py_2,1,1);
    PUTRGB(dst_1,py_1,1,0);

    LOADCHROMA(2);
    PUTRGB(dst_1,py_1,2,0);
    PUTRGB(dst_2,py_2,2,1);

    LOADCHROMA(3);
    PUTRGB(dst_2,py_2,3,1);
    PUTRGB(dst_1,py_1,3,0);
 ENDYUV2RGBLINE(8)
    LOADCHROMA(0);
    PUTRGB(dst_1,py_1,0,0);
    PUTRGB(dst_2,py_2,0,1);

    LOADCHROMA(1);
    PUTRGB(dst_2,py_2,1,1);
    PUTRGB(dst_1,py_1,1,0);
 ENDYUV2RGBFUNC()

 /*
  * Table-driven converter writing three bytes per pixel in r, g, b order.
  * 8 pixels per group occupy 24 destination bytes, hence ENDYUV2RGBLINE(24);
  * the second body handles the 4-pixel tail.
  */
 YUV2RGBFUNC(yuv2rgb_c_24_rgb, uint8_t)
    LOADCHROMA(0);
    PUTRGB24(dst_1,py_1,0);
    PUTRGB24(dst_2,py_2,0);

    LOADCHROMA(1);
    PUTRGB24(dst_2,py_2,1);
    PUTRGB24(dst_1,py_1,1);

    LOADCHROMA(2);
    PUTRGB24(dst_1,py_1,2);
    PUTRGB24(dst_2,py_2,2);

    LOADCHROMA(3);
    PUTRGB24(dst_2,py_2,3);
    PUTRGB24(dst_1,py_1,3);
 ENDYUV2RGBLINE(24)
    LOADCHROMA(0);
    PUTRGB24(dst_1,py_1,0);
    PUTRGB24(dst_2,py_2,0);

    LOADCHROMA(1);
    PUTRGB24(dst_2,py_2,1);
    PUTRGB24(dst_1,py_1,1);
 ENDYUV2RGBFUNC()

 // only trivial mods from yuv2rgb_c_24_rgb
 // (PUTBGR24 replaces PUTRGB24, so the three bytes are stored in b, g, r order)
 YUV2RGBFUNC(yuv2rgb_c_24_bgr, uint8_t)
    LOADCHROMA(0);
    PUTBGR24(dst_1,py_1,0);
    PUTBGR24(dst_2,py_2,0);

    LOADCHROMA(1);
    PUTBGR24(dst_2,py_2,1);
    PUTBGR24(dst_1,py_1,1);

    LOADCHROMA(2);
    PUTBGR24(dst_1,py_1,2);
    PUTBGR24(dst_2,py_2,2);

    LOADCHROMA(3);
    PUTBGR24(dst_2,py_2,3);
    PUTBGR24(dst_1,py_1,3);
 ENDYUV2RGBLINE(24)
    LOADCHROMA(0);
    PUTBGR24(dst_1,py_1,0);
    PUTBGR24(dst_2,py_2,0);

    LOADCHROMA(1);
    PUTBGR24(dst_2,py_2,1);
    PUTBGR24(dst_1,py_1,1);
 ENDYUV2RGBFUNC()

 // Same 8-pixel group body as yuv2rgb_c_32, with uint16_t as the type of
 // r, g, b, dst_1, dst_2. Closed with CLOSEYUV2RGBFUNC, so the 4-pixel tail
 // block is empty.
 YUV2RGBFUNC(yuv2rgb_c_16, uint16_t)
    LOADCHROMA(0);
    PUTRGB(dst_1,py_1,0,0);
    PUTRGB(dst_2,py_2,0,1);

    LOADCHROMA(1);
    PUTRGB(dst_2,py_2,1,1);
    PUTRGB(dst_1,py_1,1,0);

    LOADCHROMA(2);
    PUTRGB(dst_1,py_1,2,0);
    PUTRGB(dst_2,py_2,2,1);

    LOADCHROMA(3);
    PUTRGB(dst_2,py_2,3,1);
    PUTRGB(dst_1,py_1,3,0);
 CLOSEYUV2RGBFUNC(8)

 // Same 8-pixel group body as yuv2rgb_c_32, with uint8_t as the type of
 // r, g, b, dst_1, dst_2. Closed with CLOSEYUV2RGBFUNC, so the 4-pixel tail
 // block is empty.
 YUV2RGBFUNC(yuv2rgb_c_8, uint8_t)
    LOADCHROMA(0);
    PUTRGB(dst_1,py_1,0,0);
    PUTRGB(dst_2,py_2,0,1);

    LOADCHROMA(1);
    PUTRGB(dst_2,py_2,1,1);
    PUTRGB(dst_1,py_1,1,0);

    LOADCHROMA(2);
    PUTRGB(dst_1,py_1,2,0);
    PUTRGB(dst_2,py_2,2,1);

    LOADCHROMA(3);
    PUTRGB(dst_2,py_2,3,1);
    PUTRGB(dst_1,py_1,3,0);
 CLOSEYUV2RGBFUNC(8)

 // One byte per pixel like yuv2rgb_c_8, but a per-position dither value is
 // added to the luma index before each table lookup: dither_8x8_32 for r and
 // g, dither_8x8_73 for b. d32/d64 point at dither row (y & 7); offsets of
 // 8 and above reach into the following row, which is used for the second
 // output line (dst_2).
 YUV2RGBFUNC(yuv2rgb_c_8_ordered_dither, uint8_t)
    const uint8_t *d32 = dither_8x8_32[y&7];
    const uint8_t *d64 = dither_8x8_73[y&7];
 #define PUTRGB8(dst,src,i,o)                                    \
    Y = src[2*i];                                               \
    dst[2*i]   = r[Y+d32[0+o]] + g[Y+d32[0+o]] + b[Y+d64[0+o]]; \
    Y = src[2*i+1];                                             \
    dst[2*i+1] = r[Y+d32[1+o]] + g[Y+d32[1+o]] + b[Y+d64[1+o]];

    LOADCHROMA(0);
    PUTRGB8(dst_1,py_1,0,0);
    PUTRGB8(dst_2,py_2,0,0+8);

    LOADCHROMA(1);
    PUTRGB8(dst_2,py_2,1,2+8);
    PUTRGB8(dst_1,py_1,1,2);

    LOADCHROMA(2);
    PUTRGB8(dst_1,py_1,2,4);
    PUTRGB8(dst_2,py_2,2,4+8);

    LOADCHROMA(3);
    PUTRGB8(dst_2,py_2,3,6+8);
    PUTRGB8(dst_1,py_1,3,6);
 CLOSEYUV2RGBFUNC(8)


 // Packed 4-bit output: same chroma handling as yuv2rgb_c_32, but PUTRGB4
 // combines two pixels into one byte (first pixel in the low nibble, second
 // shifted left by 4). 8 pixels therefore advance dst by 4 bytes.
 YUV2RGBFUNC(yuv2rgb_c_4, uint8_t)
    int acc;
 #define PUTRGB4(dst,src,i)          \
    Y = src[2*i];                   \
    acc = r[Y] + g[Y] + b[Y];       \
    Y = src[2*i+1];                 \
    acc |= (r[Y] + g[Y] + b[Y])<<4; \
    dst[i] = acc;

    LOADCHROMA(0);
    PUTRGB4(dst_1,py_1,0);
    PUTRGB4(dst_2,py_2,0);

    LOADCHROMA(1);
    PUTRGB4(dst_2,py_2,1);
    PUTRGB4(dst_1,py_1,1);

    LOADCHROMA(2);
    PUTRGB4(dst_1,py_1,2);
    PUTRGB4(dst_2,py_2,2);

    LOADCHROMA(3);
    PUTRGB4(dst_2,py_2,3);
    PUTRGB4(dst_1,py_1,3);
 CLOSEYUV2RGBFUNC(4)

 // Packed 4-bit output (two pixels per byte, first pixel in the low nibble)
 // with ordered dithering: dither_8x8_220 is added to the luma index for r
 // and b, dither_8x8_73 for g. Offsets of 8 and above select the following
 // dither row, used for the second output line.
 YUV2RGBFUNC(yuv2rgb_c_4_ordered_dither, uint8_t)
    const uint8_t *d64 =  dither_8x8_73[y&7];
    const uint8_t *d128 = dither_8x8_220[y&7];
    int acc;

 #define PUTRGB4D(dst,src,i,o)                                     \
    Y = src[2*i];                                                 \
    acc = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]];        \
    Y = src[2*i+1];                                               \
    acc |= (r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]])<<4;  \
    dst[i]= acc;

    LOADCHROMA(0);
    PUTRGB4D(dst_1,py_1,0,0);
    PUTRGB4D(dst_2,py_2,0,0+8);

    LOADCHROMA(1);
    PUTRGB4D(dst_2,py_2,1,2+8);
    PUTRGB4D(dst_1,py_1,1,2);

    LOADCHROMA(2);
    PUTRGB4D(dst_1,py_1,2,4);
    PUTRGB4D(dst_2,py_2,2,4+8);

    LOADCHROMA(3);
    PUTRGB4D(dst_2,py_2,3,6+8);
    PUTRGB4D(dst_1,py_1,3,6);
 CLOSEYUV2RGBFUNC(4)

 // Same 8-pixel group body as yuv2rgb_c_32, with uint8_t as the type of
 // r, g, b, dst_1, dst_2: one byte is written per pixel. Closed with
 // CLOSEYUV2RGBFUNC, so the 4-pixel tail block is empty.
 YUV2RGBFUNC(yuv2rgb_c_4b, uint8_t)
    LOADCHROMA(0);
    PUTRGB(dst_1,py_1,0,0);
    PUTRGB(dst_2,py_2,0,1);

    LOADCHROMA(1);
    PUTRGB(dst_2,py_2,1,1);
    PUTRGB(dst_1,py_1,1,0);

    LOADCHROMA(2);
    PUTRGB(dst_1,py_1,2,0);
    PUTRGB(dst_2,py_2,2,1);

    LOADCHROMA(3);
    PUTRGB(dst_2,py_2,3,1);
    PUTRGB(dst_1,py_1,3,0);
 CLOSEYUV2RGBFUNC(8)

 // One byte per pixel with ordered dithering: dither_8x8_220 is added to the
 // luma index for r and b, dither_8x8_73 for g. Offsets of 8 and above select
 // the following dither row, used for the second output line.
 YUV2RGBFUNC(yuv2rgb_c_4b_ordered_dither, uint8_t)
    const uint8_t *d64 =  dither_8x8_73[y&7];
    const uint8_t *d128 = dither_8x8_220[y&7];

 #define PUTRGB4DB(dst,src,i,o)                                    \
    Y = src[2*i];                                                 \
    dst[2*i]   = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \
    Y = src[2*i+1];                                               \
    dst[2*i+1] = r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]];

    LOADCHROMA(0);
    PUTRGB4DB(dst_1,py_1,0,0);
    PUTRGB4DB(dst_2,py_2,0,0+8);

    LOADCHROMA(1);
    PUTRGB4DB(dst_2,py_2,1,2+8);
    PUTRGB4DB(dst_1,py_1,1,2);

    LOADCHROMA(2);
    PUTRGB4DB(dst_1,py_1,2,4);
    PUTRGB4DB(dst_2,py_2,2,4+8);

    LOADCHROMA(3);
    PUTRGB4DB(dst_2,py_2,3,6+8);
    PUTRGB4DB(dst_1,py_1,3,6);
 CLOSEYUV2RGBFUNC(8)

 // 1 bit per pixel: chroma is not read; a single table, selected with the
 // fixed chroma value 128, is indexed by luma plus a dither_8x8_220 value.
 // PUTRGB1 doubles the accumulator and adds the table entry for each pixel,
 // so 8 pixels of a row are collected into one byte, first pixel in the top
 // bit, and dst advances by 1 per group.
 YUV2RGBFUNC(yuv2rgb_c_1_ordered_dither, uint8_t)
        const uint8_t *d128 = dither_8x8_220[y&7];
        char out_1 = 0, out_2 = 0;
        g= c->table_gU[128] + c->table_gV[128];

 #define PUTRGB1(out,src,i,o)    \
    Y = src[2*i];               \
    out+= out + g[Y+d128[0+o]]; \
    Y = src[2*i+1];             \
    out+= out + g[Y+d128[1+o]];

    PUTRGB1(out_1,py_1,0,0);
    PUTRGB1(out_2,py_2,0,0+8);

    PUTRGB1(out_2,py_2,1,2+8);
    PUTRGB1(out_1,py_1,1,2);

    PUTRGB1(out_1,py_1,2,4);
    PUTRGB1(out_2,py_2,2,4+8);

    PUTRGB1(out_2,py_2,3,6+8);
    PUTRGB1(out_1,py_1,3,6);

    dst_1[0]= out_1;
    dst_2[0]= out_2;
 CLOSEYUV2RGBFUNC(1)

 /**
  * Select the YUV -> RGB conversion routine for a context.
  *
  * Selection order:
  *  1. MMX2, then MMX routines (GPL builds) when the matching CPU flag is set
  *     and the destination format is one they handle;
  *  2. the VIS, MLIB, AltiVec and Blackfin initializers, in that order, each
  *     consulted only while no routine has been found yet, so a backend that
  *     declines (returns NULL) cannot discard an earlier backend's routine;
  *  3. the generic C converters, chosen by destination format, after logging
  *     a warning.
  *
  * @param c scaler context; c->flags and c->dstFormat drive the choice
  * @return the selected routine, or NULL for a destination format without a
  *         C converter (an assert fires first in debug builds)
  */
 SwsFunc sws_yuv2rgb_get_func_ptr(SwsContext *c)
 {
    SwsFunc t = NULL;
 #if (HAVE_MMX2 || HAVE_MMX) && CONFIG_GPL
    if (c->flags & SWS_CPU_CAPS_MMX2) {
        switch (c->dstFormat) {
        case PIX_FMT_RGB32:  return yuv420_rgb32_MMX2;
        case PIX_FMT_BGR24:  return yuv420_rgb24_MMX2;
        case PIX_FMT_RGB565: return yuv420_rgb16_MMX2;
        case PIX_FMT_RGB555: return yuv420_rgb15_MMX2;
        }
    }
    if (c->flags & SWS_CPU_CAPS_MMX) {
        switch (c->dstFormat) {
        case PIX_FMT_RGB32:  return yuv420_rgb32_MMX;
        case PIX_FMT_BGR24:  return yuv420_rgb24_MMX;
        case PIX_FMT_RGB565: return yuv420_rgb16_MMX;
        case PIX_FMT_RGB555: return yuv420_rgb15_MMX;
        }
    }
 #endif
 #if HAVE_VIS
    t = sws_yuv2rgb_init_vis(c);
 #endif
 #if CONFIG_MLIB
    /* Only ask MLIB when VIS did not already supply a routine. */
    if (!t)
        t = sws_yuv2rgb_init_mlib(c);
 #endif
 #if HAVE_ALTIVEC && CONFIG_GPL
    if (!t && (c->flags & SWS_CPU_CAPS_ALTIVEC))
        t = sws_yuv2rgb_init_altivec(c);
 #endif

 #if ARCH_BFIN
    if (!t && (c->flags & SWS_CPU_CAPS_BFIN))
        t = sws_ff_bfin_yuv2rgb_get_func_ptr(c);
 #endif

    if (t)
        return t;

    av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found.\n");

    switch (c->dstFormat) {
    case PIX_FMT_BGR32_1:
    case PIX_FMT_RGB32_1:
    case PIX_FMT_BGR32:
    case PIX_FMT_RGB32:      return yuv2rgb_c_32;
    case PIX_FMT_RGB24:      return yuv2rgb_c_24_rgb;
    case PIX_FMT_BGR24:      return yuv2rgb_c_24_bgr;
    case PIX_FMT_RGB565:
    case PIX_FMT_BGR565:
    case PIX_FMT_RGB555:
    case PIX_FMT_BGR555:     return yuv2rgb_c_16;
    case PIX_FMT_RGB8:
    case PIX_FMT_BGR8:       return yuv2rgb_c_8_ordered_dither;
    case PIX_FMT_RGB4:
    case PIX_FMT_BGR4:       return yuv2rgb_c_4_ordered_dither;
    case PIX_FMT_RGB4_BYTE:
    case PIX_FMT_BGR4_BYTE:  return yuv2rgb_c_4b_ordered_dither;
    case PIX_FMT_MONOBLACK:  return yuv2rgb_c_1_ordered_dither;
    default:
        assert(0);
    }
    return NULL;
 }

 /**
  * Fill a 256-entry pointer table that maps a chroma value to a position
  * inside a luma lookup table.
  *
  * Entry i is y_table + elemsize * (((i * inc) >> 16) - (inc >> 9)), so
  * entry 128 is y_table itself.
  *
  * @param table    receives the 256 pointers
  * @param elemsize size in bytes of one element of the luma table
  * @param inc      per-step increment in 16.16 fixed point
  * @param y_table  reference position inside the luma table
  */
 static void fill_table(uint8_t* table[256], const int elemsize, const int inc, uint8_t *y_table)
 {
    uint8_t *const origin = y_table - elemsize * (inc >> 9);
    int64_t acc = 0;
    int idx = 0;

    while (idx < 256) {
        table[idx] = origin + elemsize * (acc >> 16);
        acc += inc;
        idx++;
    }
 }

 /**
  * Fill a 256-entry table of byte offsets, the integer counterpart of
  * fill_table().
  *
  * Entry i is elemsize * (((i * inc) >> 16) - (inc >> 9)), so entry 128 is 0.
  *
  * @param table    receives the 256 offsets
  * @param elemsize size in bytes of one element of the luma table
  * @param inc      per-step increment in 16.16 fixed point
  */
 static void fill_gv_table(int table[256], const int elemsize, const int inc)
 {
    const int bias = -(inc >> 9);
    int64_t acc;
    int n;

    for (n = 0, acc = 0; n < 256; n++, acc += inc)
        table[n] = elemsize * (bias + (acc >> 16));
 }

 /*
  * Build the luma/chroma lookup tables used by the C YUV -> RGB converters.
  *
  * c           context; c->dstFormat selects the table layout. The tables
  *             are stored in c->yuvTable, c->table_rV, c->table_gU,
  *             c->table_bU and c->table_gV. A previously allocated
  *             c->yuvTable is freed first.
  * inv_table   the four chroma coefficients, read in the order crv, cbu,
  *             cgu, cgv (the last two are negated before use)
  * fullRange   zero: luma is scaled by 255/219 with an offset of 16;
  *             non-zero: luma is left unscaled and the chroma coefficients
  *             are scaled by 224/255
  * brightness  subtracted (times 256) from the luma offset
  * contrast    16.16 fixed-point gain applied to luma and chroma
  * saturation  16.16 fixed-point gain applied to chroma only
  *
  * Returns 0 on success, -1 if the destination depth is not supported or
  * the table could not be allocated.
  */
 av_cold int sws_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4], int fullRange,
                                       int brightness, int contrast, int saturation)
 {
    const int isRgb =      c->dstFormat==PIX_FMT_RGB32
                        || c->dstFormat==PIX_FMT_RGB32_1
                        || c->dstFormat==PIX_FMT_BGR24
                        || c->dstFormat==PIX_FMT_RGB565
                        || c->dstFormat==PIX_FMT_RGB555
                        || c->dstFormat==PIX_FMT_RGB8
                        || c->dstFormat==PIX_FMT_RGB4
                        || c->dstFormat==PIX_FMT_RGB4_BYTE
                        || c->dstFormat==PIX_FMT_MONOBLACK;
    const int bpp = fmt_depth(c->dstFormat);
    uint8_t *y_table;
    uint16_t *y_table16;
    uint32_t *y_table32;
    int i, base, rbase, gbase, bbase, abase;
    const int yoffs = fullRange ? 384 : 326;

    int64_t crv =  inv_table[0];
    int64_t cbu =  inv_table[1];
    int64_t cgu = -inv_table[2];
    int64_t cgv = -inv_table[3];
    int64_t cy  = 1<<16;
    int64_t oy  = 0;
    int64_t cy_div;

    int64_t yb = 0;

    if (!fullRange) {
        cy = (cy*255) / 219;
        oy = 16<<16;
    } else {
        crv = (crv*224) / 255;
        cbu = (cbu*224) / 255;
        cgu = (cgu*224) / 255;
        cgv = (cgv*224) / 255;
    }

    cy  = (cy *contrast             ) >> 16;
    crv = (crv*contrast * saturation) >> 32;
    cbu = (cbu*contrast * saturation) >> 32;
    cgu = (cgu*contrast * saturation) >> 32;
    cgv = (cgv*contrast * saturation) >> 32;
    oy -= 256*brightness;

    //scale coefficients by cy
    /* A contrast of 0 makes cy zero; divide by 1 then instead of trapping.
     * cy itself is left untouched so the luma ramp below stays flat. */
    cy_div = cy ? cy : 1;
    crv = ((crv << 16) + 0x8000) / cy_div;
    cbu = ((cbu << 16) + 0x8000) / cy_div;
    cgu = ((cgu << 16) + 0x8000) / cy_div;
    cgv = ((cgv << 16) + 0x8000) / cy_div;

    av_free(c->yuvTable);

    switch (bpp) {
    case 1:
        c->yuvTable = av_malloc(1024);
        if (!c->yuvTable)
            return -1;
        y_table = c->yuvTable;
        yb = -(384<<16) - oy;
        for (i = 0; i < 1024-110; i++) {
            y_table[i+110] = av_clip_uint8((yb + 0x8000) >> 16) >> 7;
            yb += cy;
        }
        fill_table(c->table_gU, 1, cgu, y_table + yoffs);
        fill_gv_table(c->table_gV, 1, cgv);
        break;
    case 4:
    case 4|128:
        rbase = isRgb ? 3 : 0;
        gbase = 1;
        bbase = isRgb ? 0 : 3;
        c->yuvTable = av_malloc(1024*3);
        if (!c->yuvTable)
            return -1;
        y_table = c->yuvTable;
        yb = -(384<<16) - oy;
        for (i = 0; i < 1024-110; i++) {
            int yval = av_clip_uint8((yb + 0x8000) >> 16);
            y_table[i+110     ] =  (yval >> 7)       << rbase;
            y_table[i+ 37+1024] = ((yval + 43) / 85) << gbase;
            y_table[i+110+2048] =  (yval >> 7)       << bbase;
            yb += cy;
        }
        fill_table(c->table_rV, 1, crv, y_table + yoffs);
        fill_table(c->table_gU, 1, cgu, y_table + yoffs + 1024);
        fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2048);
        fill_gv_table(c->table_gV, 1, cgv);
        break;
    case 8:
        rbase = isRgb ? 5 : 0;
        gbase = isRgb ? 2 : 3;
        bbase = isRgb ? 0 : 6;
        c->yuvTable = av_malloc(1024*3);
        if (!c->yuvTable)
            return -1;
        y_table = c->yuvTable;
        yb = -(384<<16) - oy;
        for (i = 0; i < 1024-38; i++) {
            int yval = av_clip_uint8((yb + 0x8000) >> 16);
            y_table[i+16     ] = ((yval + 18) / 36) << rbase;
            y_table[i+16+1024] = ((yval + 18) / 36) << gbase;
            y_table[i+37+2048] = ((yval + 43) / 85) << bbase;
            yb += cy;
        }
        fill_table(c->table_rV, 1, crv, y_table + yoffs);
        fill_table(c->table_gU, 1, cgu, y_table + yoffs + 1024);
        fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2048);
        fill_gv_table(c->table_gV, 1, cgv);
        break;
    case 15:
    case 16:
        rbase = isRgb ? bpp - 5 : 0;
        gbase = 5;
        bbase = isRgb ? 0 : (bpp - 5);
        c->yuvTable = av_malloc(1024*3*2);
        if (!c->yuvTable)
            return -1;
        y_table16 = c->yuvTable;
        yb = -(384<<16) - oy;
        for (i = 0; i < 1024; i++) {
            uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
            y_table16[i     ] = (yval >> 3)          << rbase;
            y_table16[i+1024] = (yval >> (18 - bpp)) << gbase;
            y_table16[i+2048] = (yval >> 3)          << bbase;
            yb += cy;
        }
        fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
        fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + 1024);
        fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2048);
        fill_gv_table(c->table_gV, 2, cgv);
        break;
    case 24:
        c->yuvTable = av_malloc(1024);
        if (!c->yuvTable)
            return -1;
        y_table = c->yuvTable;
        yb = -(384<<16) - oy;
        for (i = 0; i < 1024; i++) {
            y_table[i] = av_clip_uint8((yb + 0x8000) >> 16);
            yb += cy;
        }
        fill_table(c->table_rV, 1, crv, y_table + yoffs);
        fill_table(c->table_gU, 1, cgu, y_table + yoffs);
        fill_table(c->table_bU, 1, cbu, y_table + yoffs);
        fill_gv_table(c->table_gV, 1, cgv);
        break;
    case 32:
        base = (c->dstFormat == PIX_FMT_RGB32_1 || c->dstFormat == PIX_FMT_BGR32_1) ? 8 : 0;
        rbase = base + (isRgb ? 16 : 0);
        gbase = base + 8;
        bbase = base + (isRgb ? 0 : 16);
        abase = (base + 24) & 31;
        c->yuvTable = av_malloc(1024*3*4);
        if (!c->yuvTable)
            return -1;
        y_table32 = c->yuvTable;
        yb = -(384<<16) - oy;
        for (i = 0; i < 1024; i++) {
            /* Shift as unsigned: the component bases reach bit 24, where a
             * signed int shift of an 8-bit value would overflow. */
            uint32_t yval = av_clip_uint8((yb + 0x8000) >> 16);
            y_table32[i     ] = (yval << rbase) + (255U << abase);
            y_table32[i+1024] = yval << gbase;
            y_table32[i+2048] = yval << bbase;
            yb += cy;
        }
        fill_table(c->table_rV, 4, crv, y_table32 + yoffs);
        fill_table(c->table_gU, 4, cgu, y_table32 + yoffs + 1024);
        fill_table(c->table_bU, 4, cbu, y_table32 + yoffs + 2048);
        fill_gv_table(c->table_gV, 4, cgv);
        break;
    default:
        c->yuvTable = NULL;
        av_log(c, AV_LOG_ERROR, "%ibpp not supported by yuv2rgb\n", bpp);
        return -1;
    }
    return 0;
 }
--- a/libswscale/yuv2rgb_altivec.c
+++ b/libswscale/yuv2rgb_altivec.c
@@ -0,0 +1,962 @@
 /*
 * AltiVec acceleration for colorspace conversion
 *
 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 /*
 Convert I420 YV12 to RGB in various formats,
  it rejects images that are not in 420 formats,
  it rejects images that don't have widths of multiples of 16,
  it rejects images that don't have heights of multiples of 2.
 Reject defers to C simulation code.

 Lots of optimizations to be done here.

 1. Need to fix saturation code. I just couldn't get it to fly with packs
   and adds, so we currently use max/min to clip.

 2. The inefficient use of chroma loading needs a bit of brushing up.

 3. Analysis of pipeline stalls needs to be done. Use shark to identify
   pipeline stalls.


 MODIFIED to calculate coeffs from currently selected color space.
 MODIFIED core to be a macro where you specify the output format.
 ADDED UYVY conversion which is never called due to something in swscale.
 CORRECTED algorithm selection to be strict on input formats.
 ADDED runtime detection of AltiVec.

 ADDED altivec_yuv2packedX vertical scl + RGB converter

 March 27,2004
 PERFORMANCE ANALYSIS

 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
 used as test.
 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
 same sequence.

 720 * 480 * 30  ~10MPS

 so we have roughly 10 clocks per pixel. This is too high, something has
 to be wrong.

 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
 need for vec_min.

 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
 the input video frame, it was just decompressed so it probably resides in L1
 caches. However, we are creating the output video stream. This needs to use the
 DSTST instruction to optimize for the cache. We couple this with the fact that
 we are not going to be visiting the input buffer again so we mark it Least
 Recently Used. This shaves 25% of the processor cycles off.

 Now memcpy is the largest mips consumer in the system, probably due
 to the inefficient X11 stuff.

 GL libraries seem to be very slow on this machine 1.33Ghz PB running
 Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
 a versioning issue, however I have libGL.1.2.dylib for both
 machines. (We need to figure this out now.)

 GL2 libraries work now with patch for RGB32.

 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.

 Integrated luma prescaling adjustment for saturation/contrast/brightness
 adjustment.
 */

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <inttypes.h>
 #include <assert.h>
 #include "config.h"
 #include "rgb2rgb.h"
 #include "swscale.h"
 #include "swscale_internal.h"

 #undef PROFILE_THE_BEAST
 #undef INC_SCALING

 /* Shorthand names for unsigned and signed char. */
 typedef unsigned char ubyte;
 typedef signed char   sbyte;


 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
   homogeneous vector registers x0,x1,x2 are interleaved with the
   following technique:

      o0 = vec_mergeh (x0,x1);
      o1 = vec_perm (o0, x2, perm_rgb_0);
      o2 = vec_perm (o0, x2, perm_rgb_1);
      o3 = vec_mergel (x0,x1);
      o4 = vec_perm (o3,o2,perm_rgb_2);
      o5 = vec_perm (o3,o2,perm_rgb_3);

  perm_rgb_0:   o0(RG).h v1(B) --> o1*
              0   1  2   3   4
             rgbr|gbrg|brgb|rgbr
             0010 0100 1001 0010
             0102 3145 2673 894A

  perm_rgb_1:   o0(RG).h v1(B) --> o2
              0   1  2   3   4
             gbrg|brgb|bbbb|bbbb
             0100 1001 1111 1111
             B5CD 6EF7 89AB CDEF

  perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
              0   1  2   3   4
             gbrg|brgb|rgbr|gbrg
             1111 1111 0010 0100
             89AB CDEF 0182 3945

  perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
              0   1  2   3   4
             brgb|rgbr|gbrg|brgb
             1001 0010 0100 1001
             a67b 89cA BdCD eEFf

  The four tables below are the vec_perm selectors for those steps; in
  each selector byte, values 0x00-0x0f pick from the first vec_perm
  operand and 0x10-0x1f from the second.
 */
 static
 const vector unsigned char
  perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
                0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
  perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
                0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
  perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
                0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
  perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
                0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};

 /*
  * Interleave three 16-byte planar vectors into three vectors of packed
  * 3-byte pixels using the perm_rgb_* selectors.
  * The first three arguments are the inputs (note the parameter order
  * x2,x1,x0); y0,y1,y2 must be lvalues and receive the three output vectors.
  */
 #define vec_merge3(x2,x1,x0,y0,y1,y2)       \
 do {                                        \
    __typeof__(x0) o0,o2,o3;                \
        o0 = vec_mergeh (x0,x1);            \
        y0 = vec_perm (o0, x2, perm_rgb_0); \
        o2 = vec_perm (o0, x2, perm_rgb_1); \
        o3 = vec_mergel (x0,x1);            \
        y1 = vec_perm (o3,o2,perm_rgb_2);   \
        y2 = vec_perm (o3,o2,perm_rgb_3);   \
 } while(0)

 /*
  * Interleave the planar vectors x0,x1,x2 with vec_merge3 and store the
  * three resulting vectors at ptr, advancing ptr by three vectors.
  * ptr must be a modifiable vector pointer; it is evaluated several times.
  * The expansion has no trailing semicolon, so the macro behaves as a single
  * statement (safe inside if/else); callers supply the ';'.
  */
 #define vec_mstbgr24(x0,x1,x2,ptr)      \
 do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
 }  while (0)

 /*
  * Same as vec_mstbgr24 but with the first and third component swapped
  * (x2,x1,x0 are handed to vec_merge3): interleave and store three vectors
  * at ptr, advancing ptr by three vectors.
  * ptr must be a modifiable vector pointer; it is evaluated several times.
  * The expansion has no trailing semicolon, so the macro behaves as a single
  * statement (safe inside if/else); callers supply the ';'.
  */
 #define vec_mstrgb24(x0,x1,x2,ptr)      \
 do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
 }  while (0)

 /* pack the pixels in rgb0 format
   msb R
   lsb 0

   T is the vector type of x0..x3. The four planar vectors are interleaved
   byte-wise (x0,x1,x2,x3 for each pixel) and written as four vectors
   (64 bytes) starting at ptr, which is then advanced by four vectors.
   ptr must be a modifiable vector pointer; it is evaluated several times.
   The expansion has no trailing semicolon, so the macro behaves as a single
   statement (safe inside if/else); callers supply the ';'.
 */
 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
 do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)ptr);                                              \
    vec_st (_3, 1*16, (T *)ptr);                                              \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)ptr);                                              \
    vec_st (_3, 3*16, (T *)ptr);                                              \
    ptr += 4;                                                                 \
 }  while (0)

 /*

  | 1     0       1.4021   | | Y |
  | 1    -0.3441 -0.7142   |x| Cb|
  | 1     1.7718  0        | | Cr|


  Y:      [-128 127]
  Cb/Cr : [-128 127]

  Typical YUV conversions work on Y in 0-255; this version has been optimized for JPEG decode.

 */




 /*
  * Widen bytes 0..7 of x to 16 bits with zero extension: the selector pairs
  * index 0x10 (byte 0 of the all-zero second operand) with each of bytes
  * 0x00..0x07 of x, and the result is viewed as vector signed short.
  */
 #define vec_unh(x) \
    (vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                         0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
 /*
  * Widen bytes 8..15 of x to 16 bits with zero extension: the selector pairs
  * index 0x10 (byte 0 of the all-zero second operand) with each of bytes
  * 0x08..0x0F of x, and the result is viewed as vector signed short.
  */
 #define vec_unl(x) \
    (vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                         0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))

 /* Clamp every signed 16-bit element of x to the range [16, 235]. */
 #define vec_clip_s16(x) \
    vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \
                         ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))

 /*
  * Pack two vectors of signed shorts into one vector of unsigned chars:
  * negative elements are first raised to 0 with vec_max, then vec_packs
  * narrows x (low half of the result) and y (high half) to bytes.
  */
 #define vec_packclp(x,y) \
    (vector unsigned char)vec_packs \
        ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
         (vector unsigned short)vec_max (y,((vector signed short) {0})))

 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)


 /*
  * Convert one vector of 16-bit Y, U and V samples to R, G and B using the
  * coefficients stored in the context (CY, OY, CBU, CRV, CGU, CGV, CSHIFT).
  * 128 is subtracted from U and V before the chroma terms are formed; the
  * results are written through R, G and B.
  */
 static inline void cvtyuvtoRGB (SwsContext *c,
                                 vector signed short Y, vector signed short U, vector signed short V,
                                 vector signed short *R, vector signed short *G, vector signed short *B)
 {
    const vector signed short chroma_bias =
        (vector signed short) vec_splat((vector signed short){128},0);
    vector signed short luma, cb, cr, green_u;

    luma = vec_mradds (Y, c->CY, c->OY);
    cb   = vec_sub (U, chroma_bias);
    cr   = vec_sub (V, chroma_bias);

    /* B = luma + ((CBU * (cb << CSHIFT) + 0x4000) >> 15) */
    *B = vec_mradds (vec_sl (cb, c->CSHIFT), c->CBU, luma);

    /* R = luma + ((CRV * (cr << CSHIFT) + 0x4000) >> 15) */
    *R = vec_mradds (vec_sl (cr, c->CSHIFT), c->CRV, luma);

    /* G = luma + CGU*cb + CGV*cr, accumulated in two multiply-add steps */
    green_u = vec_mradds (cb, c->CGU, luma);
    *G      = vec_mradds (cr, c->CGV, green_u);
 }


 /*
  ------------------------------------------------------------------------------
  CS converters
  ------------------------------------------------------------------------------
 */


 /*
  * DEFCSP420_CVT(name, out_pixels) defines
  *
  *     static int altivec_<name>(SwsContext *c, unsigned char **in,
  *                               int *instrides, int srcSliceY, int srcSliceH,
  *                               unsigned char **oplanes, int *outstrides)
  *
  * which converts a slice of planar input (in[0] = Y, in[1] = U, in[2] = V)
  * to packed output in oplanes[0], starting at output row srcSliceY, and
  * returns srcSliceH.
  *
  * out_pixels(R,G,B,ptr) is the store macro that writes 16 pixels and
  * advances ptr (one of the out_* macros).
  *
  * Each outer iteration handles two luma rows (the loop runs h/2 times);
  * each inner iteration handles 16 pixels of both rows (it runs w/16 times,
  * so a width remainder below 16 is not converted here). The same 8 U and 8 V
  * samples are used for both rows and each is duplicated horizontally with
  * vec_mergeh/vec_mergel. Inputs are read with the vec_lvsl + vec_perm
  * pattern, so the source pointers need not be 16-byte aligned; the output
  * is written with vec_st.
  *
  * The conversion coefficients are copied from the context into locals
  * (lCY, lOY, lCRV, lCBU, lCGU, lCGV, lCSHIFT) before the loops.
  *
  * NOTE(review): after the inner loop oute/outo have already been advanced
  * by the stores of one row, and are then advanced by a further
  * outstrides[0]>>4 vectors. This lands on the next row pair only when one
  * output row occupies exactly outstrides[0] bytes - confirm against callers.
  * NOTE(review): y1ivP/y2ivP/uivP/vivP element [1] is always read, which
  * touches the 16 bytes following the current position - confirm the input
  * buffers are padded accordingly.
  */
 #define DEFCSP420_CVT(name,out_pixels)                                  \
 static int altivec_##name (SwsContext *c,                               \
                           unsigned char **in, int *instrides,          \
                           int srcSliceY,        int srcSliceH,         \
                           unsigned char **oplanes, int *outstrides)    \
 {                                                                       \
    int w = c->srcW;                                                    \
    int h = srcSliceH;                                                  \
    int i,j;                                                            \
    int instrides_scl[3];                                               \
    vector unsigned char y0,y1;                                         \
                                                                        \
    vector signed char  u,v;                                            \
                                                                        \
    vector signed short Y0,Y1,Y2,Y3;                                    \
    vector signed short U,V;                                            \
    vector signed short vx,ux,uvx;                                      \
    vector signed short vx0,ux0,uvx0;                                   \
    vector signed short vx1,ux1,uvx1;                                   \
    vector signed short R0,G0,B0;                                       \
    vector signed short R1,G1,B1;                                       \
    vector unsigned char R,G,B;                                         \
                                                                        \
    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
    vector unsigned char align_perm;                                    \
                                                                        \
    vector signed short                                                 \
        lCY  = c->CY,                                                   \
        lOY  = c->OY,                                                   \
        lCRV = c->CRV,                                                  \
        lCBU = c->CBU,                                                  \
        lCGU = c->CGU,                                                  \
        lCGV = c->CGV;                                                  \
                                                                        \
    vector unsigned short lCSHIFT = c->CSHIFT;                          \
                                                                        \
    ubyte *y1i   = in[0];                                               \
    ubyte *y2i   = in[0]+instrides[0];                                  \
    ubyte *ui    = in[1];                                               \
    ubyte *vi    = in[2];                                               \
                                                                        \
    vector unsigned char *oute                                          \
        = (vector unsigned char *)                                      \
            (oplanes[0]+srcSliceY*outstrides[0]);                       \
    vector unsigned char *outo                                          \
        = (vector unsigned char *)                                      \
            (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);         \
                                                                        \
                                                                        \
    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
                                                                        \
                                                                        \
    for (i=0;i<h/2;i++) {                                               \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
                                                                        \
        for (j=0;j<w/16;j++) {                                          \
                                                                        \
            y1ivP = (vector unsigned char *)y1i;                        \
            y2ivP = (vector unsigned char *)y2i;                        \
            uivP  = (vector unsigned char *)ui;                         \
            vivP  = (vector unsigned char *)vi;                         \
                                                                        \
            align_perm = vec_lvsl (0, y1i);                             \
            y0 = (vector unsigned char)                                 \
                 vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, y2i);                             \
            y1 = (vector unsigned char)                                 \
                 vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, ui);                              \
            u = (vector signed char)                                    \
                vec_perm (uivP[0], uivP[1], align_perm);                \
                                                                        \
            align_perm = vec_lvsl (0, vi);                              \
            v = (vector signed char)                                    \
                vec_perm (vivP[0], vivP[1], align_perm);                \
                                                                        \
            u  = (vector signed char)                                   \
                 vec_sub (u,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
            v  = (vector signed char)                                   \
                 vec_sub (v,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
                                                                        \
            U  = vec_unpackh (u);                                       \
            V  = vec_unpackh (v);                                       \
                                                                        \
                                                                        \
            Y0 = vec_unh (y0);                                          \
            Y1 = vec_unl (y0);                                          \
            Y2 = vec_unh (y1);                                          \
            Y3 = vec_unl (y1);                                          \
                                                                        \
            Y0 = vec_mradds (Y0, lCY, lOY);                             \
            Y1 = vec_mradds (Y1, lCY, lOY);                             \
            Y2 = vec_mradds (Y2, lCY, lOY);                             \
            Y3 = vec_mradds (Y3, lCY, lOY);                             \
                                                                        \
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
            ux = vec_sl (U, lCSHIFT);                                   \
            ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
            ux0  = vec_mergeh (ux,ux);                                  \
            ux1  = vec_mergel (ux,ux);                                  \
                                                                        \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
            vx = vec_sl (V, lCSHIFT);                                   \
            vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
            vx0  = vec_mergeh (vx,vx);                                  \
            vx1  = vec_mergel (vx,vx);                                  \
                                                                        \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
            uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
            uvx = vec_mradds (V, lCGV, uvx);                            \
            uvx0 = vec_mergeh (uvx,uvx);                                \
            uvx1 = vec_mergel (uvx,uvx);                                \
                                                                        \
            R0 = vec_add (Y0,vx0);                                      \
            G0 = vec_add (Y0,uvx0);                                     \
            B0 = vec_add (Y0,ux0);                                      \
            R1 = vec_add (Y1,vx1);                                      \
            G1 = vec_add (Y1,uvx1);                                     \
            B1 = vec_add (Y1,ux1);                                      \
                                                                        \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
            out_pixels(R,G,B,oute);                                     \
                                                                        \
            R0 = vec_add (Y2,vx0);                                      \
            G0 = vec_add (Y2,uvx0);                                     \
            B0 = vec_add (Y2,ux0);                                      \
            R1 = vec_add (Y3,vx1);                                      \
            G1 = vec_add (Y3,uvx1);                                     \
            B1 = vec_add (Y3,ux1);                                      \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
                                                                        \
            out_pixels(R,G,B,outo);                                     \
                                                                        \
            y1i  += 16;                                                 \
            y2i  += 16;                                                 \
            ui   += 8;                                                  \
            vi   += 8;                                                  \
                                                                        \
        }                                                               \
                                                                        \
        outo  += (outstrides[0])>>4;                                    \
        oute  += (outstrides[0])>>4;                                    \
                                                                        \
        ui    += instrides_scl[1];                                      \
        vi    += instrides_scl[2];                                      \
        y1i   += instrides_scl[0];                                      \
        y2i   += instrides_scl[0];                                      \
    }                                                                   \
    return srcSliceH;                                                   \
 }


 /*
  * Store macros usable as the out_pixels argument of DEFCSP420_CVT.
  * Each takes three component vectors (a,b,c) and an output pointer.
  * The 32-bit variants forward to vec_mstrgb32 with a {255} vector literal
  * in the fourth (alpha) slot, placed first or last as the name indicates;
  * the 24-bit variants forward to vec_mstrgb24 / vec_mstbgr24.
  */
 #define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
 #define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
 #define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
 #define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)

 DEFCSP420_CVT (yuv2_abgr, out_abgr)
 #if 1
 DEFCSP420_CVT (yuv2_bgra, out_bgra)
 #else
 static int altivec_yuv2_bgra32 (SwsContext *c,
                                unsigned char **in, int *instrides,
                                int srcSliceY,        int srcSliceH,
                                unsigned char **oplanes, int *outstrides)
 {
    int w = c->srcW;
    int h = srcSliceH;
    int i,j;
    int instrides_scl[3];
    vector unsigned char y0,y1;

    vector signed char  u,v;

    vector signed short Y0,Y1,Y2,Y3;
    vector signed short U,V;
    vector signed short vx,ux,uvx;
    vector signed short vx0,ux0,uvx0;
    vector signed short vx1,ux1,uvx1;
    vector signed short R0,G0,B0;
    vector signed short R1,G1,B1;
    vector unsigned char R,G,B;

    vector unsigned char *uivP, *vivP;
    vector unsigned char align_perm;

    vector signed short
        lCY  = c->CY,
        lOY  = c->OY,
        lCRV = c->CRV,
        lCBU = c->CBU,
        lCGU = c->CGU,
        lCGV = c->CGV;

    vector unsigned short lCSHIFT = c->CSHIFT;

    ubyte *y1i   = in[0];
    ubyte *y2i   = in[0]+w;
    ubyte *ui    = in[1];
    ubyte *vi    = in[2];

    vector unsigned char *oute
        = (vector unsigned char *)
          (oplanes[0]+srcSliceY*outstrides[0]);
    vector unsigned char *outo
        = (vector unsigned char *)
          (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);


    instrides_scl[0] = instrides[0];
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */


    for (i=0;i<h/2;i++) {
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);

        for (j=0;j<w/16;j++) {

            y0 = vec_ldl (0,y1i);
            y1 = vec_ldl (0,y2i);
            uivP = (vector unsigned char *)ui;
            vivP = (vector unsigned char *)vi;

            align_perm = vec_lvsl (0, ui);
            u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);

            align_perm = vec_lvsl (0, vi);
            v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
            u  = (vector signed char)
                 vec_sub (u,(vector signed char)
                          vec_splat((vector signed char){128},0));

            v  = (vector signed char)
                 vec_sub (v, (vector signed char)
                          vec_splat((vector signed char){128},0));

            U  = vec_unpackh (u);
            V  = vec_unpackh (v);


            Y0 = vec_unh (y0);
            Y1 = vec_unl (y0);
            Y2 = vec_unh (y1);
            Y3 = vec_unl (y1);

            Y0 = vec_mradds (Y0, lCY, lOY);
            Y1 = vec_mradds (Y1, lCY, lOY);
            Y2 = vec_mradds (Y2, lCY, lOY);
            Y3 = vec_mradds (Y3, lCY, lOY);

            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
            ux = vec_sl (U, lCSHIFT);
            ux = vec_mradds (ux, lCBU, (vector signed short){0});
            ux0  = vec_mergeh (ux,ux);
            ux1  = vec_mergel (ux,ux);

            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
            vx = vec_sl (V, lCSHIFT);
            vx = vec_mradds (vx, lCRV, (vector signed short){0});
            vx0  = vec_mergeh (vx,vx);
            vx1  = vec_mergel (vx,vx);
            /* uvx = ((CGU*u) + (CGV*v))>>15 */
            uvx = vec_mradds (U, lCGU, (vector signed short){0});
            uvx = vec_mradds (V, lCGV, uvx);
            uvx0 = vec_mergeh (uvx,uvx);
            uvx1 = vec_mergel (uvx,uvx);
            R0 = vec_add (Y0,vx0);
            G0 = vec_add (Y0,uvx0);
            B0 = vec_add (Y0,ux0);
            R1 = vec_add (Y1,vx1);
            G1 = vec_add (Y1,uvx1);
            B1 = vec_add (Y1,ux1);
            R  = vec_packclp (R0,R1);
            G  = vec_packclp (G0,G1);
            B  = vec_packclp (B0,B1);

            out_argb(R,G,B,oute);
            R0 = vec_add (Y2,vx0);
            G0 = vec_add (Y2,uvx0);
            B0 = vec_add (Y2,ux0);
            R1 = vec_add (Y3,vx1);
            G1 = vec_add (Y3,uvx1);
            B1 = vec_add (Y3,ux1);
            R  = vec_packclp (R0,R1);
            G  = vec_packclp (G0,G1);
            B  = vec_packclp (B0,B1);

            out_argb(R,G,B,outo);
            y1i  += 16;
            y2i  += 16;
            ui   += 8;
            vi   += 8;

        }

        outo  += (outstrides[0])>>4;
        oute  += (outstrides[0])>>4;

        ui    += instrides_scl[1];
        vi    += instrides_scl[2];
        y1i   += instrides_scl[0];
        y2i   += instrides_scl[0];
    }
    return srcSliceH;
 }

 #endif


 /* Instantiate one planar-YUV converter per packed output layout.  Each
    invocation pairs a name with the out_* store macro the generated code
    should use.  DEFCSP420_CVT itself is defined earlier in the file (outside
    this excerpt); presumably it emits a function called altivec_<name>,
    since sws_yuv2rgb_init_altivec() returns altivec_yuv2_rgba and friends. */
 DEFCSP420_CVT (yuv2_rgba, out_rgba)
 DEFCSP420_CVT (yuv2_argb, out_argb)
 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)


 /* vec_perm selectors that split one 16-byte UYVY vector (8 pixels) into
    eight 16-bit lanes.  Byte positions inside the source vector:

      uyvy|uyvy|uyvy|uyvy
      0123 4567 89ab cdef

    Selector value 0x10 addresses byte 0 of the second vec_perm operand; the
    callers in this file pass an all-zero vector there, so 0x10 supplies the
    zero high byte of every lane.  demux_u and demux_v select each chroma byte
    twice (once for each pixel of the pair sharing it), demux_y selects the
    eight luma bytes in order. */
 static
 const vector unsigned char
    demux_u = {0x10,0x00,0x10,0x00,
               0x10,0x04,0x10,0x04,
               0x10,0x08,0x10,0x08,
               0x10,0x0c,0x10,0x0c},
    demux_v = {0x10,0x02,0x10,0x02,
               0x10,0x06,0x10,0x06,
               0x10,0x0A,0x10,0x0A,
               0x10,0x0E,0x10,0x0E},
    demux_y = {0x10,0x01,0x10,0x03,
               0x10,0x05,0x10,0x07,
               0x10,0x09,0x10,0x0B,
               0x10,0x0D,0x10,0x0F};

 /**
  * Convert a slice of packed UYVY 4:2:2 to 32-bit RGB using AltiVec.
  * (Originally written to play live CCIR raw video.)
  *
  * Each inner iteration loads two 16-byte vectors (16 pixels), splits them
  * into Y/U/V lanes with the demux_* permutes, converts them with
  * cvtyuvtoRGB() and stores the result through out_rgba.
  *
  * The source and destination row pointers are recomputed from instrides[0]
  * and outstrides[0] for every line, so buffers whose stride is larger than
  * the packed row size are handled correctly.
  *
  * @param c          scaler context; srcW is read and c is passed to cvtyuvtoRGB()
  * @param in         in[0] points at the first UYVY row of the slice
  * @param instrides  instrides[0] is the source row stride in bytes
  * @param srcSliceY  first destination row of this slice
  * @param srcSliceH  number of rows in the slice
  * @param oplanes    oplanes[0] is the destination image base
  * @param outstrides outstrides[0] is the destination row stride in bytes
  * @return srcSliceH
  *
  * Only w/16 full groups of 16 pixels are converted per row.
  * NOTE(review): the only caller-visible registration is for PIX_FMT_BGR32
  * while the store macro is out_rgba -- confirm the component order.
  */
 static int altivec_uyvy_rgb32 (SwsContext *c,
                                unsigned char **in, int *instrides,
                                int srcSliceY,        int srcSliceH,
                                unsigned char **oplanes, int *outstrides)
 {
    int w = c->srcW;
    int h = srcSliceH;
    int i,j;
    vector unsigned char uyvy;
    vector signed   short Y,U,V;
    vector signed   short R0,G0,B0,R1,G1,B1;
    vector unsigned char  R,G,B;
    vector unsigned char *out;
    ubyte *img;

    for (i=0;i<h;i++) {
        /* start every line from its own stride-derived address instead of
           assuming the rows are tightly packed */
        img = in[0] + i*instrides[0];
        out = (vector unsigned char *)(oplanes[0]+(srcSliceY+i)*outstrides[0]);

        for (j=0;j<w/16;j++) {
            /* first 8 pixels */
            uyvy = vec_ld (0, img);
            U = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_u);

            V = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_v);

            Y = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_y);

            cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);

            /* second 8 pixels */
            uyvy = vec_ld (16, img);
            U = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_u);

            V = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_v);

            Y = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_y);

            cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);

            R  = vec_packclp (R0,R1);
            G  = vec_packclp (G0,G1);
            B  = vec_packclp (B0,B1);

            /* out_rgba is defined outside this excerpt; the surrounding code
               relies on it advancing `out` past the pixels it stores */
            out_rgba (R,G,B,out);

            img += 32;
        }
    }
    return srcSliceH;
 }



 /**
  * Select an AltiVec YUV->RGB converter for the given context.
  *
  * The accelerated routines only handle source widths that are a multiple of
  * 16 and, for the planar inputs, source heights that are a multiple of 2.
  * In every other case NULL is returned so the caller can fall back to the
  * C implementation.
  *
  * Odd widths were tried by the original author with a handful of videos;
  * MPlayer failed elsewhere first (e.g.
  * "mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv" ends in an
  * X11 bad match), so the restriction was judged not to matter much.
  *
  * @param c scaler context; flags, srcW, srcH, srcFormat and dstFormat are read
  * @return the converter, or NULL when no AltiVec routine applies
  */
 SwsFunc sws_yuv2rgb_init_altivec (SwsContext *c)
 {
    const int altivec_enabled = (c->flags & SWS_CPU_CAPS_ALTIVEC) != 0;
    const int width_ok        = (c->srcW & 0xf) == 0;

    if (!altivec_enabled || !width_ok)
        return NULL;

    /* packed 4:2:2 input has exactly one accelerated destination */
    if (c->srcFormat == PIX_FMT_UYVY422) {
        if (c->dstFormat != PIX_FMT_BGR32)
            return NULL;
        av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
        return altivec_uyvy_rgb32;
    }

    /* everything else must be one of the formats routed to the 4:2:0 code */
    switch (c->srcFormat) {
    case PIX_FMT_YUV410P:
    case PIX_FMT_YUV420P:
    /*case IMGFMT_CLPL:        ??? */
    case PIX_FMT_GRAY8:
    case PIX_FMT_NV12:
    case PIX_FMT_NV21:
        break;
    default:
        return NULL;
    }

    if (c->srcH & 0x1)
        return NULL;

    switch (c->dstFormat) {
    case PIX_FMT_RGB24:
        av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
        return altivec_yuv2_rgb24;
    case PIX_FMT_BGR24:
        av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
        return altivec_yuv2_bgr24;
    case PIX_FMT_ARGB:
        av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
        return altivec_yuv2_argb;
    case PIX_FMT_ABGR:
        av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
        return altivec_yuv2_abgr;
    case PIX_FMT_RGBA:
        av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
        return altivec_yuv2_rgba;
    case PIX_FMT_BGRA:
        av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
        return altivec_yuv2_bgra;
    default:
        break;
    }
    return NULL;
 }

 /**
  * Fill the AltiVec colour-conversion constants of a scaler context.
  *
  * Six 16-bit coefficients are computed in a small aligned buffer and each
  * one is then broadcast to all lanes of the matching context vector
  * (CY, OY, CRV, CBU, CGU, CGV).  CSHIFT is set to a splat of 2.
  *
  * @param c          context whose vector constants are written
  * @param inv_table  four inverse-table entries; [0] feeds CRV, [1] CBU,
  *                   [2] CGU and [3] CGV
  * @param brightness scaled by -256 to form the luma offset
  * @param contrast   used both for the luma gain and (>>16) for the chroma gains
  * @param saturation used (>>16) for the chroma gains
  */
 void sws_yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
 {
    /* scalar view and vector view of the same 16 bytes */
    union {
        signed short tmp[8] __attribute__ ((aligned(16)));
        vector signed short vec;
    } coeffs;
    const int con = contrast   >> 16;
    const int sat = saturation >> 16;

    coeffs.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;    /* cy  */
    coeffs.tmp[1] =  -256*brightness;                  /* oy  */
    coeffs.tmp[2] =  (inv_table[0]>>3) * con * sat;    /* crv */
    coeffs.tmp[3] =  (inv_table[1]>>3) * con * sat;    /* cbu */
    coeffs.tmp[4] = -((inv_table[2]>>1) * con * sat);  /* cgu */
    coeffs.tmp[5] = -((inv_table[3]>>1) * con * sat);  /* cgv */

    c->CSHIFT = (vector unsigned short)vec_splat_u16(2);

    /* broadcast lane n of the buffer into the n-th constant */
    c->CY   = vec_splat ((vector signed short)coeffs.vec, 0);
    c->OY   = vec_splat ((vector signed short)coeffs.vec, 1);
    c->CRV  = vec_splat ((vector signed short)coeffs.vec, 2);
    c->CBU  = vec_splat ((vector signed short)coeffs.vec, 3);
    c->CGU  = vec_splat ((vector signed short)coeffs.vec, 4);
    c->CGV  = vec_splat ((vector signed short)coeffs.vec, 5);
 }


 void
 altivec_yuv2packedX (SwsContext *c,
                     int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                     uint8_t *dest, int dstW, int dstY)
 {
    int i,j;
    vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
    vector signed short R0,G0,B0,R1,G1,B1;

    vector unsigned char R,G,B;
    vector unsigned char *out,*nout;

    vector signed short   RND = vec_splat_s16(1<<3);
    vector unsigned short SCL = vec_splat_u16(4);
    unsigned long scratch[16] __attribute__ ((aligned (16)));

    vector signed short *YCoeffs, *CCoeffs;

    YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
    CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;

    out = (vector unsigned char *)dest;

    for (i=0; i<dstW; i+=16){
        Y0 = RND;
        Y1 = RND;
        /* extract 16 coeffs from lumSrc */
        for (j=0; j<lumFilterSize; j++) {
            X0 = vec_ld (0,  &lumSrc[j][i]);
            X1 = vec_ld (16, &lumSrc[j][i]);
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
        }

        U = RND;
        V = RND;
        /* extract 8 coeffs from U,V */
        for (j=0; j<chrFilterSize; j++) {
            X  = vec_ld (0, &chrSrc[j][i/2]);
            U  = vec_mradds (X, CCoeffs[j], U);
            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
            V  = vec_mradds (X, CCoeffs[j], V);
        }

        /* scale and clip signals */
        Y0 = vec_sra (Y0, SCL);
        Y1 = vec_sra (Y1, SCL);
        U  = vec_sra (U,  SCL);
        V  = vec_sra (V,  SCL);

        Y0 = vec_clip_s16 (Y0);
        Y1 = vec_clip_s16 (Y1);
        U  = vec_clip_s16 (U);
        V  = vec_clip_s16 (V);

        /* now we have
          Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
          U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7

          Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
          U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
          V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
        */

        U0 = vec_mergeh (U,U);
        V0 = vec_mergeh (V,V);

        U1 = vec_mergel (U,U);
        V1 = vec_mergel (V,V);

        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

        R  = vec_packclp (R0,R1);
        G  = vec_packclp (G0,G1);
        B  = vec_packclp (B0,B1);

        switch(c->dstFormat) {
            case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
            case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
            case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
            case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
            case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
            case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
            default:
            {
                /* If this is reached, the caller should have called yuv2packedXinC
                   instead. */
                static int printed_error_message;
                if (!printed_error_message) {
                    av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
                           sws_format_name(c->dstFormat));
                    printed_error_message=1;
                }
                return;
            }
        }
    }

    if (i < dstW) {
        i -= 16;

        Y0 = RND;
        Y1 = RND;
        /* extract 16 coeffs from lumSrc */
        for (j=0; j<lumFilterSize; j++) {
            X0 = vec_ld (0,  &lumSrc[j][i]);
            X1 = vec_ld (16, &lumSrc[j][i]);
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
        }

        U = RND;
        V = RND;
        /* extract 8 coeffs from U,V */
        for (j=0; j<chrFilterSize; j++) {
            X  = vec_ld (0, &chrSrc[j][i/2]);
            U  = vec_mradds (X, CCoeffs[j], U);
            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
            V  = vec_mradds (X, CCoeffs[j], V);
        }

        /* scale and clip signals */
        Y0 = vec_sra (Y0, SCL);
        Y1 = vec_sra (Y1, SCL);
        U  = vec_sra (U,  SCL);
        V  = vec_sra (V,  SCL);

        Y0 = vec_clip_s16 (Y0);
        Y1 = vec_clip_s16 (Y1);
        U  = vec_clip_s16 (U);
        V  = vec_clip_s16 (V);

        /* now we have
           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7

           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
        */

        U0 = vec_mergeh (U,U);
        V0 = vec_mergeh (V,V);

        U1 = vec_mergel (U,U);
        V1 = vec_mergel (V,V);

        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

        R  = vec_packclp (R0,R1);
        G  = vec_packclp (G0,G1);
        B  = vec_packclp (B0,B1);

        nout = (vector unsigned char *)scratch;
        switch(c->dstFormat) {
            case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
            case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
            case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
            case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
            case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
            case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
            default:
                /* Unreachable, I think. */
                av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
                       sws_format_name(c->dstFormat));
                return;
        }

        memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
    }

 }
--- a/libswscale/yuv2rgb_bfin.c
+++ b/libswscale/yuv2rgb_bfin.c
@@ -0,0 +1,203 @@
 /*
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * Blackfin video color space converter operations
 * convert I420 YV12 to RGB in various formats
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <inttypes.h>
 #include <assert.h>
 #include "config.h"
 #include <unistd.h>
 #include "rgb2rgb.h"
 #include "swscale.h"
 #include "swscale_internal.h"

 /* L1CODE tags a declaration with the l1_text attribute when building for
    FDPIC; otherwise it expands to nothing. */
 #ifdef __FDPIC__
 #define L1CODE __attribute__ ((l1_text))
 #else
 #define L1CODE
 #endif

 /* Per-line converters; only declared here, their definitions are not part
    of this file.  Each takes one luma row, one U and one V row, an output
    row, a width and a pointer to a block of 32-bit coefficients. */
 void ff_bfin_yuv2rgb555_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
                               int w, uint32_t *coeffs) L1CODE;

 void ff_bfin_yuv2rgb565_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
                               int w, uint32_t *coeffs) L1CODE;

 void ff_bfin_yuv2rgb24_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
                              int w, uint32_t *coeffs) L1CODE;

 /* Function-pointer type matching the line converters above; used by
    core_yuv420_rgb() to call the selected one. */
 typedef void (* ltransform)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
                             int w, uint32_t *coeffs);


 /**
  * Load the 32-bit coefficient fields of the context that the Blackfin line
  * converters read (core_yuv420_rgb() passes them &c->oy).
  *
  * @param c     context; yOffset and the *Coeff fields are read, the
  *              oy/oc/cy/zero/crv/cbu/cgu/cgv and mask fields are written
  * @param rgb   non-zero: red takes vrCoeff and blue ubCoeff; zero: the red/blue
  *              and the two green coefficients are exchanged
  * @param masks 555 or 565 selects the matching channel masks; any other
  *              value leaves rmask/gmask/bmask untouched
  */
 static void bfin_prepare_coefficients (SwsContext *c, int rgb, int masks)
 {
    /* low 16 bits of the luma offset, reduced to U8.0 for the offset math */
    const int luma_off = (int)(c->yOffset & 0xffff) >> 3;

    /* replicate the byte offsets into all four bytes of a word */
    c->oc   = 128*0x01010101U;
    c->oy   = luma_off*0x01010101U;

    /* copy 64bit vector coeffs down to 32bit vector coeffs */
    c->cy   = c->yCoeff;
    c->zero = 0;

    c->crv = rgb ? c->vrCoeff : c->ubCoeff;
    c->cbu = rgb ? c->ubCoeff : c->vrCoeff;
    c->cgu = rgb ? c->ugCoeff : c->vgCoeff;
    c->cgv = rgb ? c->vgCoeff : c->ugCoeff;

    /* channel masks, duplicated into both 16-bit halves */
    switch (masks) {
    case 555:
        c->rmask = 0x001f * 0x00010001U;
        c->gmask = 0x03e0 * 0x00010001U;
        c->bmask = 0x7c00 * 0x00010001U;
        break;
    case 565:
        c->rmask = 0x001f * 0x00010001U;
        c->gmask = 0x07e0 * 0x00010001U;
        c->bmask = 0xf800 * 0x00010001U;
        break;
    default:
        break;
    }
 }

 /**
  * Shared driver for the Blackfin 4:2:0 converters.
  *
  * Prepares the coefficients, then walks the slice two luma rows at a time:
  * both rows of a pair are converted with the same chroma rows, after which
  * the chroma pointers advance by one row.
  *
  * @param lcscf per-line converter to call
  * @param rgb   1 takes U from in[1] and V from in[2]; 0 exchanges the planes
  * @param masks forwarded to bfin_prepare_coefficients()
  * @return srcSliceH
  *
  * The width handed to the line converter is instrides[0].
  * NOTE(review): with rgb == 0 the planes are exchanged but the stride
  * indices are not -- harmless only if both chroma strides are equal.
  */
 static int core_yuv420_rgb (SwsContext *c,
                             uint8_t **in, int *instrides,
                             int srcSliceY, int srcSliceH,
                             uint8_t **oplanes, int *outstrides,
                             ltransform lcscf, int rgb, int masks)
 {
    const int width   = instrides[0];
    uint8_t *luma     = in[0];
    uint8_t *chroma_u = in[1 + (1 ^ rgb)];
    uint8_t *chroma_v = in[1 + (0 ^ rgb)];
    uint8_t *dst_row  = oplanes[0] + srcSliceY*outstrides[0];
    int pairs_left;

    bfin_prepare_coefficients (c, rgb, masks);

    for (pairs_left = srcSliceH >> 1; pairs_left > 0; pairs_left--) {
        int row;

        /* two luma rows share one chroma row */
        for (row = 0; row < 2; row++) {
            lcscf (luma, chroma_u, chroma_v, dst_row, width, &c->oy);
            luma    += instrides[0];
            dst_row += outstrides[0];
        }

        chroma_u += instrides[1];
        chroma_v += instrides[2];
    }

    return srcSliceH;
 }


 /* YUV 4:2:0 -> RGB555: runs core_yuv420_rgb() with the 555 line converter,
    rgb = 1 and the 555 mask set.  Returns what the core routine returns. */
 static int bfin_yuv420_rgb555 (SwsContext *c,
                                uint8_t **in, int *instrides,
                                int srcSliceY, int srcSliceH,
                                uint8_t **oplanes, int *outstrides)
 {
    ltransform line_fn = ff_bfin_yuv2rgb555_line;
    const int  rgb     = 1;
    const int  masks   = 555;

    return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH,
                            oplanes, outstrides, line_fn, rgb, masks);
 }

 /* YUV 4:2:0 -> BGR555: runs core_yuv420_rgb() with the 555 line converter,
    rgb = 0 and the 555 mask set.  Returns what the core routine returns. */
 static int bfin_yuv420_bgr555 (SwsContext *c,
                                uint8_t **in, int *instrides,
                                int srcSliceY, int srcSliceH,
                                uint8_t **oplanes, int *outstrides)
 {
    ltransform line_fn = ff_bfin_yuv2rgb555_line;
    const int  rgb     = 0;
    const int  masks   = 555;

    return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH,
                            oplanes, outstrides, line_fn, rgb, masks);
 }

 /* YUV 4:2:0 -> RGB24: runs core_yuv420_rgb() with the 24-bit line converter,
    rgb = 1 and masks = 888.  Returns what the core routine returns. */
 static int bfin_yuv420_rgb24 (SwsContext *c,
                               uint8_t **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               uint8_t **oplanes, int *outstrides)
 {
    ltransform line_fn = ff_bfin_yuv2rgb24_line;
    const int  rgb     = 1;
    const int  masks   = 888;

    return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH,
                            oplanes, outstrides, line_fn, rgb, masks);
 }

 /* YUV 4:2:0 -> BGR24: runs core_yuv420_rgb() with the 24-bit line converter,
    rgb = 0 and masks = 888.  Returns what the core routine returns. */
 static int bfin_yuv420_bgr24 (SwsContext *c,
                               uint8_t **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               uint8_t **oplanes, int *outstrides)
 {
    ltransform line_fn = ff_bfin_yuv2rgb24_line;
    const int  rgb     = 0;
    const int  masks   = 888;

    return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH,
                            oplanes, outstrides, line_fn, rgb, masks);
 }

 /* YUV 4:2:0 -> RGB565: runs core_yuv420_rgb() with the 565 line converter,
    rgb = 1 and the 565 mask set.  Returns what the core routine returns. */
 static int bfin_yuv420_rgb565 (SwsContext *c,
                                uint8_t **in, int *instrides,
                                int srcSliceY, int srcSliceH,
                                uint8_t **oplanes, int *outstrides)
 {
    ltransform line_fn = ff_bfin_yuv2rgb565_line;
    const int  rgb     = 1;
    const int  masks   = 565;

    return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH,
                            oplanes, outstrides, line_fn, rgb, masks);
 }

 /* YUV 4:2:0 -> BGR565: runs core_yuv420_rgb() with the 565 line converter,
    rgb = 0 and the 565 mask set.  Returns what the core routine returns. */
 static int bfin_yuv420_bgr565 (SwsContext *c,
                                uint8_t **in, int *instrides,
                                int srcSliceY, int srcSliceH,
                                uint8_t **oplanes, int *outstrides)
 {
    ltransform line_fn = ff_bfin_yuv2rgb565_line;
    const int  rgb     = 0;
    const int  masks   = 565;

    return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH,
                            oplanes, outstrides, line_fn, rgb, masks);
 }


 /**
  * Pick the Blackfin converter matching c->dstFormat.
  *
  * @param c context; only dstFormat is read (and c is passed to av_log)
  * @return the converter, or a null pointer when the destination format has
  *         no Blackfin routine.  A successful selection is logged at
  *         AV_LOG_INFO.
  */
 SwsFunc ff_bfin_yuv2rgb_get_func_ptr (SwsContext *c)
 {
    SwsFunc converter = NULL;

    switch(c->dstFormat) {
    case PIX_FMT_RGB555: converter = bfin_yuv420_rgb555; break;
    case PIX_FMT_BGR555: converter = bfin_yuv420_bgr555; break;
    case PIX_FMT_RGB565: converter = bfin_yuv420_rgb565; break;
    case PIX_FMT_BGR565: converter = bfin_yuv420_bgr565; break;
    case PIX_FMT_RGB24:  converter = bfin_yuv420_rgb24;  break;
    case PIX_FMT_BGR24:  converter = bfin_yuv420_bgr24;  break;
    default:             break;
    }

    /* unsupported destination: stay silent and report "none" */
    if (!converter)
        return NULL;

    av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
           sws_format_name (c->dstFormat));

    return converter;
 }
--- a/libswscale/yuv2rgb_mlib.c
+++ b/libswscale/yuv2rgb_mlib.c
@@ -0,0 +1,85 @@
 /*
 * software YUV to RGB converter using mediaLib
 *
 * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <mlib_types.h>
 #include <mlib_status.h>
 #include <mlib_sys.h>
 #include <mlib_video.h>
 #include <inttypes.h>
 #include <stdlib.h>
 #include <assert.h>

 #include "swscale.h"

 /**
  * Convert a slice to 32-bit ARGB through mlib_VideoColorYUV2ARGB420().
  *
  * For PIX_FMT_YUV422P input both chroma strides are doubled IN PLACE in the
  * caller's srcStride array before the 4:2:0 routine is called (presumably so
  * it steps over every second chroma line).
  * NOTE(review): the doubling happens on every call, so the caller must hand
  * in a fresh stride array each time -- confirm against sws_scale().
  *
  * @return srcSliceH
  */
 static int mlib_YUV2ARGB420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                int srcSliceH, uint8_t* dst[], int dstStride[]){
    uint8_t *first_out_row;

    if(PIX_FMT_YUV422P == c->srcFormat){
        srcStride[1] = srcStride[1] * 2;
        srcStride[2] = srcStride[2] * 2;
    }

    /* mediaLib takes a single stride for both chroma planes */
    assert(srcStride[1] == srcStride[2]);

    first_out_row = dst[0] + srcSliceY*dstStride[0];
    mlib_VideoColorYUV2ARGB420(first_out_row, src[0], src[1], src[2],
                               c->dstW, srcSliceH,
                               dstStride[0], srcStride[0], srcStride[1]);
    return srcSliceH;
 }

 /**
  * Convert a slice to 32-bit ABGR through mlib_VideoColorYUV2ABGR420().
  *
  * For PIX_FMT_YUV422P input both chroma strides are doubled IN PLACE in the
  * caller's srcStride array before the 4:2:0 routine is called (presumably so
  * it steps over every second chroma line).
  * NOTE(review): the doubling happens on every call, so the caller must hand
  * in a fresh stride array each time -- confirm against sws_scale().
  *
  * @return srcSliceH
  */
 static int mlib_YUV2ABGR420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                int srcSliceH, uint8_t* dst[], int dstStride[]){
    uint8_t *first_out_row;

    if(PIX_FMT_YUV422P == c->srcFormat){
        srcStride[1] = srcStride[1] * 2;
        srcStride[2] = srcStride[2] * 2;
    }

    /* mediaLib takes a single stride for both chroma planes */
    assert(srcStride[1] == srcStride[2]);

    first_out_row = dst[0] + srcSliceY*dstStride[0];
    mlib_VideoColorYUV2ABGR420(first_out_row, src[0], src[1], src[2],
                               c->dstW, srcSliceH,
                               dstStride[0], srcStride[0], srcStride[1]);
    return srcSliceH;
 }

 /**
  * Convert a slice to 24-bit RGB through mlib_VideoColorYUV2RGB420().
  *
  * For PIX_FMT_YUV422P input both chroma strides are doubled IN PLACE in the
  * caller's srcStride array before the 4:2:0 routine is called (presumably so
  * it steps over every second chroma line).
  * NOTE(review): the doubling happens on every call, so the caller must hand
  * in a fresh stride array each time -- confirm against sws_scale().
  *
  * @return srcSliceH
  */
 static int mlib_YUV2RGB420_24(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                               int srcSliceH, uint8_t* dst[], int dstStride[]){
    uint8_t *first_out_row;

    if(PIX_FMT_YUV422P == c->srcFormat){
        srcStride[1] = srcStride[1] * 2;
        srcStride[2] = srcStride[2] * 2;
    }

    /* mediaLib takes a single stride for both chroma planes */
    assert(srcStride[1] == srcStride[2]);

    first_out_row = dst[0] + srcSliceY*dstStride[0];
    mlib_VideoColorYUV2RGB420(first_out_row, src[0], src[1], src[2],
                              c->dstW, srcSliceH,
                              dstStride[0], srcStride[0], srcStride[1]);
    return srcSliceH;
 }


 /**
  * Pick the mediaLib converter for c->dstFormat.
  *
  * RGB24 maps to the 24-bit routine, BGR32 to the ARGB routine and RGB32 to
  * the ABGR routine.
  *
  * @return the converter, or NULL for any other destination format
  */
 SwsFunc sws_yuv2rgb_init_mlib(SwsContext *c)
 {
    if (c->dstFormat == PIX_FMT_RGB24)
        return mlib_YUV2RGB420_24;
    if (c->dstFormat == PIX_FMT_BGR32)
        return mlib_YUV2ARGB420_32;
    if (c->dstFormat == PIX_FMT_RGB32)
        return mlib_YUV2ABGR420_32;

    return NULL;
 }

--- a/libswscale/yuv2rgb_template.c
+++ b/libswscale/yuv2rgb_template.c
@@ -0,0 +1,453 @@
 /*
 * yuv2rgb_mmx.c, software YUV to RGB converter with Intel MMX "technology"
 *
 * Copyright (C) 2000, Silicon Integrated System Corp
 *
 * Author: Olie Lho <ollie@sis.com.tw>
 *
 * 15,24 bpp and dithering from Michael Niedermayer (michaelni@gmx.at)
 * MMX/MMX2 Template stuff from Michael Niedermayer (needed for fast movntq support)
 * context / deglobalize stuff by Michael Niedermayer
 *
 * This file is part of mpeg2dec, a free MPEG-2 video decoder
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with mpeg2dec; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 /* Instruction mnemonics used by the asm below, chosen from the HAVE_*
    configuration in effect when this template is compiled.  They are
    #undef'd first so the file can redefine them -- presumably because the
    template is included more than once with different settings (TODO
    confirm against the including file). */
 #undef MOVNTQ
 #undef EMMS
 #undef SFENCE

 #if HAVE_AMD3DNOW
 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
 #define EMMS     "femms"
 #else
 #define EMMS     "emms"
 #endif

 /* With MMX2 the 8-byte stores use movntq; without it MOVNTQ degrades to a
    plain movq.  NOTE(review): the "/nop" fallback for SFENCE presumably
    relies on the assembler treating '/' as a comment introducer -- verify
    for the assemblers in use. */
 #if HAVE_MMX2
 #define MOVNTQ "movntq"
 #define SFENCE "sfence"
 #else
 #define MOVNTQ "movq"
 #define SFENCE "/nop"
 #endif

 /* Asm fragment: colour-convert 8 pixels.
  *
  * On entry: mm0 = 4 Cb bytes, mm1 = 4 Cr bytes, mm6 = 8 Y bytes and mm4 = 0
  * (mm4 is the zero source for the first two unpacks).  Operand %4 is the
  * base address for the U_OFFSET/V_OFFSET/Y_OFFSET and *_COEFF tables.
  *
  * On exit: mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0 as unsigned bytes.
  * mm3, mm4, mm5 and mm7 are overwritten along the way, so a caller that
  * needs a zero in mm4 afterwards has to clear it again. */
 #define YUV2RGB \
    /* Do the multiply part of the conversion for even and odd pixels,
       register usage:
       mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
       mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
       mm6 -> Y even, mm7 -> Y odd */\
    /* convert the chroma part */\
    "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
    "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
 \
    "psllw $3, %%mm0;" /* Promote precision */ \
    "psllw $3, %%mm1;" /* Promote precision */ \
 \
    "psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */ \
    "psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */ \
 \
    "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
    "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
 \
    "pmulhw "UG_COEFF"(%4), %%mm2;" /* Mul Cb with green coeff -> Cb green */ \
    "pmulhw "VG_COEFF"(%4), %%mm3;" /* Mul Cr with green coeff -> Cr green */ \
 \
    "pmulhw "UB_COEFF"(%4), %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\
    "pmulhw "VR_COEFF"(%4), %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\
 \
    "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\
 \
    /* convert the luma part */\
    "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
    "pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\
 \
    "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */\
 \
    "psllw $3, %%mm6;" /* Promote precision */\
    "psllw $3, %%mm7;" /* Promote precision */\
 \
    "psubw "Y_OFFSET"(%4), %%mm6;" /* Y -= 16 */\
    "psubw "Y_OFFSET"(%4), %%mm7;" /* Y -= 16 */\
 \
    "pmulhw "Y_COEFF"(%4), %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\
    "pmulhw "Y_COEFF"(%4), %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\
 \
    /* Do the addition part of the conversion for even and odd pixels,
       register usage:
       mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
       mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
       mm6 -> Y even, mm7 -> Y odd */\
    "movq %%mm0, %%mm3;" /* Copy Cblue */\
    "movq %%mm1, %%mm4;" /* Copy Cred */\
    "movq %%mm2, %%mm5;" /* Copy Cgreen */\
 \
    "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */\
    "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */\
 \
    "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */\
    "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */\
 \
    "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */\
    "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */\
 \
    /* Limit RGB even to 0..255 */\
    "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0  B6 B4 B2 B0 */\
    "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0  R6 R4 R2 R0 */\
    "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0  G6 G4 G2 G0 */\
 \
    /* Limit RGB odd to 0..255 */\
    "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1  B7 B5 B3 B1 */\
    "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1  R7 R5 R3 R1 */\
    "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1  G7 G5 G3 G1 */\
 \
    /* Interleave RGB even and odd */\
    "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */\
    "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */\
    "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\


 /* For PIX_FMT_YUV422P input, double both chroma strides in place.  The
    expansion refers to `c` and `srcStride` of the enclosing function and
    modifies the srcStride array it was given.  (Presumably this lets the
    4:2:0 row addressing in YUV2RGB_LOOP be reused unchanged -- confirm.) */
 #define YUV422_UNSHIFT                   \
    if(c->srcFormat == PIX_FMT_YUV422P){ \
        srcStride[1] *= 2;               \
        srcStride[2] *= 2;               \
    }                                    \

 /* Loop prologue shared by the converters; `depth` is the number of output
    bytes per pixel.
    - h_size is dstW rounded up to a multiple of 8, reduced by 8 again when
      h_size*depth would exceed |dstStride[0]|.
    - mm4 is cleared once before the row loop.
    - Opens the per-row `for` (closed by YUV2RGB_ENDLOOP) and derives the
      output row and the Y/U/V row pointers; the chroma row index is y>>1.
    - `index` starts at -h_size/2 and is counted up towards zero by the asm.
    Uses c, src, srcStride, dst, dstStride, srcSliceY, srcSliceH, y and
    h_size from the enclosing function. */
 #define YUV2RGB_LOOP(depth)                                   \
    h_size= (c->dstW+7)&~7;                                   \
    if(h_size*depth > FFABS(dstStride[0])) h_size-=8;         \
 \
    __asm__ volatile ("pxor %mm4, %mm4;" /* zero mm4 */ );    \
    for (y= 0; y<srcSliceH; y++ ) {                           \
        uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0]; \
        uint8_t *py = src[0] + y*srcStride[0];                \
        uint8_t *pu = src[1] + (y>>1)*srcStride[1];           \
        uint8_t *pv = src[2] + (y>>1)*srcStride[2];           \
        long index= -h_size/2;                                \

 /* Opens the asm statement for one row: preloads the first 4 Cb bytes into
    mm0, 4 Cr bytes into mm1 and 8 Y bytes into mm6, then places the loop
    label "1:".  The operands are declared by YUV2RGB_ENDLOOP: %0 is `index`,
    %2/%3 are the U/V pointers biased by -index and %5 is the Y pointer
    biased by -2*index, so (%2,%0), (%3,%0) and (%5,%0,2) address the current
    samples. */
 #define YUV2RGB_INIT                                                       \
        /* This MMX assembly code deals with a SINGLE scan line at a time, \
         * it converts 8 pixels in each iteration. */                      \
        __asm__ volatile (                                                 \
        /* load data for start of next scan line */                        \
        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \
        "movd    (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \
        "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
        /*                                                                 \
        ".balign 16     \n\t"                                              \
        */                                                                 \
        "1:             \n\t"                                              \
        /* No speed difference on my p3@500 with prefetch,                 \
         * if it is faster for anyone with -benchmark then tell me.        \
        PREFETCH" 64(%0) \n\t"                                             \
        PREFETCH" 64(%1) \n\t"                                             \
        PREFETCH" 64(%2) \n\t"                                             \
        */                                                                 \

 /* Loop epilogue matching YUV2RGB_LOOP/YUV2RGB_INIT; `depth` is the number
    of output bytes per pixel.
    - Advances the output pointer (%1) by depth*8 bytes (8 pixels) and
      `index` (%0) by 4, and jumps back to label 1 while index is negative.
    - Declares the asm operands: %0 index and %1 image are read/write; %2 and
      %3 are pu/pv minus index, %4 is &c->redDither (the base the *_OFFSET,
      *_COEFF and *_DITHER displacements are applied to) and %5 is py minus
      2*index.
    - Closes the row loop, executes EMMS and returns srcSliceH from the
      enclosing function. */
 #define YUV2RGB_ENDLOOP(depth) \
        "add $"AV_STRINGIFY(depth*8)", %1    \n\t" \
        "add                       $4, %0    \n\t" \
        " js                       1b        \n\t" \
 \
        : "+r" (index), "+r" (image) \
        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index) \
        ); \
    } \
    __asm__ volatile (EMMS); \
    return srcSliceH; \

 /**
  * Convert a slice of planar YUV to 16 bits per pixel (5-6-5 packing, blue
  * in the low bits) with MMX, 8 pixels per asm iteration.
  *
  * The row loop, the asm operands and the final `return srcSliceH` come from
  * the YUV2RGB_LOOP / YUV2RGB_INIT / YUV2RGB_ENDLOOP macros.  For 4:2:2
  * planar input YUV422_UNSHIFT doubles the chroma strides in srcStride.
  *
  * The dither fields of the context are refreshed for every row; they are
  * only added to the pixels when DITHER1XBPP is defined.
  */
 static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                        int srcSliceH, uint8_t* dst[], int dstStride[]){
    int y, h_size;

    YUV422_UNSHIFT
    YUV2RGB_LOOP(2)

        /* alternate the dither patterns between even and odd rows */
        c->blueDither= ff_dither8[y&1];
        c->greenDither= ff_dither4[y&1];
        c->redDither= ff_dither8[(y+1)&1];

        YUV2RGB_INIT
        YUV2RGB

 #ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%4), %%mm0;"
        "paddusb "GREEN_DITHER"(%4), %%mm2;"
        "paddusb "RED_DITHER"(%4), %%mm1;"
 #endif
        /* mask unneeded bits off */
        "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
        "pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */
        "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */

        "psrlw   $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
        "pxor %%mm4, %%mm4;" /* zero mm4 (YUV2RGB left Cred in it) */

        "movq %%mm0, %%mm5;" /* Copy B7-B0 */
        "movq %%mm2, %%mm7;" /* Copy G7-G0 */

        /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
        "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
        "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

        "psllw  $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
        "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */

        /* the loads for the next iteration are interleaved with the stores */
        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
        MOVNTQ "      %%mm0, (%1);" /* store pixel 0-3 */

        /* convert RGB24 plane to RGB16 pack for pixel 4-7 */
        "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
        "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

        "psllw        $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

        "por       %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

        MOVNTQ "   %%mm5, 8 (%1);" /* store pixel 4-7 */

    YUV2RGB_ENDLOOP(2)
 }

 /**
  * Convert a slice of planar YUV to packed 15 bpp RGB with MMX inline asm.
  * The output layout built below is 0:1 r:5 g:5 b:5 within each 16-bit word
  * (see the bit diagrams next to the instructions).
  *
  * The row loop, the pointer setup and the asm operand list are supplied by
  * the YUV422_UNSHIFT / YUV2RGB_LOOP / YUV2RGB_ENDLOOP macros defined earlier
  * in this file. Per the operand list that closes the loop macro the asm
  * operands are: %0 = pixel index, %1 = output pointer, %2 / %3 = Cb / Cr
  * pointers, %4 = &c->redDither, %5 = luma pointer.
  * NOTE(review): the argument 2 passed to the loop macros is presumably the
  * number of output bytes per pixel -- confirm against the macro definition.
  *
  * Each pass through the asm body stores 8 pixels (two 8-byte stores) and
  * pre-loads the Y/Cb/Cr data for the following 8 pixels.
  * The closing macro returns srcSliceH.
  */
 static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
    int y, h_size; /* not used directly here; presumably consumed by the loop macros */

    YUV422_UNSHIFT
    YUV2RGB_LOOP(2)

        /* pick the per-row dither constants; all three channels use the 8-step table here */
        c->blueDither= ff_dither8[y&1];
        c->greenDither= ff_dither8[y&1];
        c->redDither= ff_dither8[(y+1)&1];

        YUV2RGB_INIT
        YUV2RGB
        /* from here on: mm0 = B, mm2 = G, mm1 = R (8 bytes each) */

 #ifdef DITHER1XBPP
        /* add the dither values with unsigned saturation before truncating */
        "paddusb "BLUE_DITHER"(%4), %%mm0  \n\t"
        "paddusb "GREEN_DITHER"(%4), %%mm2  \n\t"
        "paddusb "RED_DITHER"(%4), %%mm1  \n\t"
 #endif

        /* mask unneeded bits off (the same 5-bit mask is applied to every channel) */
        "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
        "pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */
        "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */

        "psrlw   $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
        "psrlw   $1, %%mm1;" /* 0_r7r6r5  r4r3_0_0 0_r7r6r5 r4r3_0_0 */
        "pxor %%mm4, %%mm4;" /* zero mm4 */

        "movq %%mm0, %%mm5;" /* Copy B7-B0 */
        "movq %%mm2, %%mm7;" /* Copy G7-G0 */

        /* convert RGB24 plane to RGB15 pack for pixel 0-3 */
        "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
        "punpcklbw %%mm1, %%mm0;" /* 0_r7r6r5 r4r3_0_0 0_0_0_b7 b6b5b4b3 */

        "psllw  $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
        "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */

        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
        MOVNTQ "      %%mm0, (%1);"  /* store pixel 0-3 */

        /* convert RGB24 plane to RGB15 pack for pixel 4-7 */
        "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
        "punpckhbw %%mm1, %%mm5;" /* 0_r7r6r5 r4r3_0_0 0_0_0_b7 b6b5b4b3 */

        "psllw        $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

        "por       %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

        MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */

    YUV2RGB_ENDLOOP(2)
 }

 /**
  * Convert a slice of planar YUV to packed 24 bpp output with MMX inline asm.
  * Each pass through the asm body turns the B/G/R byte planes of 8 pixels
  * into 24 output bytes, written as three 8-byte stores at (%1), 8(%1) and
  * 16(%1), and pre-loads the Y/Cb/Cr data for the following 8 pixels.
  *
  * Two packing variants are compiled:
  *  - HAVE_MMX2: pshufw shuffles combined with the ff_M24A/B/C byte masks;
  *  - otherwise: plain MMX unpack/shift/or sequences.
  *
  * The row loop, the pointer setup and the asm operand list are supplied by
  * the YUV422_UNSHIFT / YUV2RGB_LOOP / YUV2RGB_ENDLOOP macros defined earlier
  * in this file. Per the operand list that closes the loop macro the asm
  * operands are: %0 = pixel index, %1 = output pointer, %2 / %3 = Cb / Cr
  * pointers, %5 = luma pointer.
  * NOTE(review): the argument 3 passed to the loop macros is presumably the
  * number of output bytes per pixel -- confirm against the macro definition.
  * The closing macro returns srcSliceH.
  */
 static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
    int y, h_size; /* not used directly here; presumably consumed by the loop macros */

    YUV422_UNSHIFT
    YUV2RGB_LOOP(3)

        YUV2RGB_INIT
        YUV2RGB
        /* mm0=B, %%mm2=G, %%mm1=R */
 #if HAVE_MMX2
        /* mm4 and mm7 hold two of the three byte-select masks for the whole block */
        "movq "MANGLE(ff_M24A)", %%mm4     \n\t"
        "movq "MANGLE(ff_M24C)", %%mm7     \n\t"
        "pshufw $0x50, %%mm0, %%mm5     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */
        "pshufw $0x50, %%mm2, %%mm3     \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */
        "pshufw $0x00, %%mm1, %%mm6     \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */

        "pand   %%mm4, %%mm5            \n\t" /*    B2        B1       B0 */
        "pand   %%mm4, %%mm3            \n\t" /*    G2        G1       G0 */
        "pand   %%mm7, %%mm6            \n\t" /*       R1        R0       */

        "psllq     $8, %%mm3            \n\t" /* G2        G1       G0    */
        "por    %%mm5, %%mm6            \n\t"
        "por    %%mm3, %%mm6            \n\t"
        MOVNTQ" %%mm6, (%1)             \n\t" /* store output bytes 0-7 */

        "psrlq     $8, %%mm2            \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */
        "pshufw $0xA5, %%mm0, %%mm5     \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */
        "pshufw $0x55, %%mm2, %%mm3     \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */
        "pshufw $0xA5, %%mm1, %%mm6     \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */

        "pand "MANGLE(ff_M24B)", %%mm5     \n\t" /* B5       B4        B3    */
        "pand          %%mm7, %%mm3     \n\t" /*       G4        G3       */
        "pand          %%mm4, %%mm6     \n\t" /*    R4        R3       R2 */

        "por    %%mm5, %%mm3            \n\t" /* B5    G4 B4     G3 B3    */
        "por    %%mm3, %%mm6            \n\t"
        MOVNTQ" %%mm6, 8(%1)            \n\t" /* store output bytes 8-15 */

        "pshufw $0xFF, %%mm0, %%mm5     \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */
        "pshufw $0xFA, %%mm2, %%mm3     \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */
        "pshufw $0xFA, %%mm1, %%mm6     \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */
        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

        "pand          %%mm7, %%mm5     \n\t" /*       B7        B6       */
        "pand          %%mm4, %%mm3     \n\t" /*    G7        G6       G5 */
        "pand "MANGLE(ff_M24B)", %%mm6     \n\t" /* R7       R6        R5    */
        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 \
        "por          %%mm5, %%mm3      \n\t"
        "por          %%mm3, %%mm6      \n\t"
        MOVNTQ"       %%mm6, 16(%1)     \n\t" /* store output bytes 16-23 */
        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
        "pxor         %%mm4, %%mm4      \n\t" /* zero mm4 again for the next iteration */

 #else

        /* plain MMX: build four 0RGB dword pairs, then squeeze out the zero bytes */
        "pxor      %%mm4, %%mm4     \n\t"
        "movq      %%mm0, %%mm5     \n\t" /* B */
        "movq      %%mm1, %%mm6     \n\t" /* R */
        "punpcklbw %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */
        "punpcklbw %%mm4, %%mm1     \n\t" /* 0R0R0R0R 0 */
        "punpckhbw %%mm2, %%mm5     \n\t" /* GBGBGBGB 2 */
        "punpckhbw %%mm4, %%mm6     \n\t" /* 0R0R0R0R 2 */
        "movq      %%mm0, %%mm7     \n\t" /* GBGBGBGB 0 */
        "movq      %%mm5, %%mm3     \n\t" /* GBGBGBGB 2 */
        "punpcklwd %%mm1, %%mm7     \n\t" /* 0RGB0RGB 0 */
        "punpckhwd %%mm1, %%mm0     \n\t" /* 0RGB0RGB 1 */
        "punpcklwd %%mm6, %%mm5     \n\t" /* 0RGB0RGB 2 */
        "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */

        "movq      %%mm7, %%mm2     \n\t" /* 0RGB0RGB 0 */
        "movq      %%mm0, %%mm6     \n\t" /* 0RGB0RGB 1 */
        "movq      %%mm5, %%mm1     \n\t" /* 0RGB0RGB 2 */
        "movq      %%mm3, %%mm4     \n\t" /* 0RGB0RGB 3 */

        "psllq       $40, %%mm7     \n\t" /* RGB00000 0 */
        "psllq       $40, %%mm0     \n\t" /* RGB00000 1 */
        "psllq       $40, %%mm5     \n\t" /* RGB00000 2 */
        "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */

        "punpckhdq %%mm2, %%mm7     \n\t" /* 0RGBRGB0 0 */
        "punpckhdq %%mm6, %%mm0     \n\t" /* 0RGBRGB0 1 */
        "punpckhdq %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */
        "punpckhdq %%mm4, %%mm3     \n\t" /* 0RGBRGB0 3 */

        "psrlq        $8, %%mm7     \n\t" /* 00RGBRGB 0 */
        "movq      %%mm0, %%mm6     \n\t" /* 0RGBRGB0 1 */
        "psllq       $40, %%mm0     \n\t" /* GB000000 1 */
        "por       %%mm0, %%mm7     \n\t" /* GBRGBRGB 0 */
        MOVNTQ"    %%mm7, (%1)      \n\t" /* store output bytes 0-7 */

        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

        "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */
        "movq      %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */
        "psllq       $24, %%mm5     \n\t" /* BRGB0000 2 */
        "por       %%mm5, %%mm6     \n\t" /* BRGBRGBR 1 */
        MOVNTQ"    %%mm6, 8(%1)     \n\t" /* store output bytes 8-15 */

        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */

        "psrlq       $40, %%mm1     \n\t" /* 000000RG 2 */
        "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */
        "por       %%mm3, %%mm1     \n\t" /* RGBRGBRG 2 */
        MOVNTQ"    %%mm1, 16(%1)    \n\t" /* store output bytes 16-23 */

        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
        "pxor      %%mm4, %%mm4     \n\t" /* zero mm4 again for the next iteration */
 #endif

    YUV2RGB_ENDLOOP(3)
 }

 /*
  * Asm fragment: interleave 8 pixels held as byte planes (mm0 = B, mm1 = R,
  * mm2 = G, mm3 = A) into 32 bytes of packed 32-bit pixels stored at (%1);
  * the resulting memory byte order per pixel is B, G, R, A.
  * Afterwards it starts the loads for the next iteration (Cb -> mm0,
  * Cr -> mm1, Y -> mm6) and clears mm4. mm4..mm7 are used as scratch.
  */
 #define RGB_PLANAR2PACKED32                                             \
    /* convert RGB plane to RGB packed format,                          \
       mm0 ->  B, mm1 -> R, mm2 -> G, mm3 -> A,                         \
       mm4 -> GB, mm5 -> AR pixel 4-7,                                  \
       mm6 -> GB, mm7 -> AR pixel 0-3 */                                \
    "movq      %%mm0, %%mm6;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
    "movq      %%mm1, %%mm7;"   /* R7 R6 R5 R4 R3 R2 R1 R0 */           \
 \
    "movq      %%mm0, %%mm4;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
    "movq      %%mm1, %%mm5;"   /* R7 R6 R5 R4 R3 R2 R1 R0 */           \
 \
    "punpcklbw %%mm2, %%mm6;"   /* G3 B3 G2 B2 G1 B1 G0 B0 */           \
    "punpcklbw %%mm3, %%mm7;"   /* A3 R3 A2 R2 A1 R1 A0 R0 */           \
 \
    "punpcklwd %%mm7, %%mm6;"   /* A1 R1 G1 B1 A0 R0 G0 B0 */           \
    MOVNTQ "   %%mm6, (%1);"    /* Store ARGB1 ARGB0 */                 \
 \
    "movq      %%mm0, %%mm6;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
    "punpcklbw %%mm2, %%mm6;"   /* G3 B3 G2 B2 G1 B1 G0 B0 */           \
 \
    "punpckhwd %%mm7, %%mm6;"   /* A3 R3 G3 B3 A2 R2 G2 B2 */           \
    MOVNTQ "   %%mm6, 8 (%1);"  /* Store ARGB3 ARGB2 */                 \
 \
    "punpckhbw %%mm2, %%mm4;"   /* G7 B7 G6 B6 G5 B5 G4 B4 */           \
    "punpckhbw %%mm3, %%mm5;"   /* A7 R7 A6 R6 A5 R5 A4 R4 */           \
 \
    "punpcklwd %%mm5, %%mm4;"   /* A5 R5 G5 B5 A4 R4 G4 B4 */           \
    MOVNTQ "   %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */                 \
 \
    "movq      %%mm0, %%mm4;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
    "punpckhbw %%mm2, %%mm4;"   /* G7 B7 G6 B6 G5 B5 G4 B4 */           \
 \
    "punpckhwd %%mm5, %%mm4;"   /* A7 R7 G7 B7 A6 R6 G6 B6 */           \
    MOVNTQ "   %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */                 \
 \
    "movd 4 (%2, %0), %%mm0;"   /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \
    "movd 4 (%3, %0), %%mm1;"   /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \
 \
    "pxor         %%mm4, %%mm4;" /* zero mm4 */                         \
    "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \

 /**
  * Convert a slice of planar YUV to packed 32 bpp output with MMX inline asm.
  * The alpha byte of every pixel is written as 0xff: mm3 is filled with ones
  * and used as the A plane by RGB_PLANAR2PACKED32.
  *
  * The row loop, the pointer setup and the asm operand list are supplied by
  * the YUV422_UNSHIFT / YUV2RGB_LOOP / YUV2RGB_ENDLOOP macros defined earlier
  * in this file; the closing macro returns srcSliceH.
  * NOTE(review): the argument 4 passed to the loop macros is presumably the
  * number of output bytes per pixel -- confirm against the macro definition.
  */
 static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
    int y, h_size; /* not used directly here; presumably consumed by the loop macros */

    YUV422_UNSHIFT
    YUV2RGB_LOOP(4)

        YUV2RGB_INIT
        YUV2RGB
        /* mm0 = B, mm2 = G, mm1 = R; comparing mm3 with itself sets every bit */
        "pcmpeqd   %%mm3, %%mm3;"   /* fill mm3 */
        RGB_PLANAR2PACKED32

    YUV2RGB_ENDLOOP(4)
 }
--- a/libswscale/yuv2rgb_vis.c
+++ b/libswscale/yuv2rgb_vis.c
@@ -0,0 +1,209 @@
 /*
 * VIS optimized software YUV to RGB converter
 * Copyright (c) 2007 Denes Balatoni <dbalatoni@programozo.hu>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <inttypes.h>
 #include <stdlib.h>

 #include "swscale.h"
 #include "swscale_internal.h"

 /*
  * Asm prologue shared by the VIS converters in this file: writes 0x10 to
  * %gsr and loads ten 64-bit constants from the table addressed by asm
  * operand %5 (c->sparc_coeffs in both users) into %f32..%f50, i.e.
  * table entries 0..4 -> %f32..%f40 and entries 5..9 -> %f42..%f50.
  * The table layout is set up by sws_yuv2rgb_init_vis().
  * NOTE(review): 0x10 presumably selects the GSR scale factor consumed by
  * fpack16 in YUV2RGB_KERNEL -- confirm against the VIS manual.
  */
 #define YUV2RGB_INIT \
    "wr %%g0, 0x10, %%gsr \n\t" \
    "ldd [%5], %%f32      \n\t" \
    "ldd [%5+8], %%f34    \n\t" \
    "ldd [%5+16], %%f36   \n\t" \
    "ldd [%5+24], %%f38   \n\t" \
    "ldd [%5+32], %%f40   \n\t" \
    "ldd [%5+40], %%f42   \n\t" \
    "ldd [%5+48], %%f44   \n\t" \
    "ldd [%5+56], %%f46   \n\t" \
    "ldd [%5+64], %%f48   \n\t" \
    "ldd [%5+72], %%f50   \n\t"

 #define YUV2RGB_KERNEL \
    /* ^^^^ f0=Y f3=u f5=v */ \
    "fmul8x16 %%f3, %%f48, %%f6   \n\t" \
    "fmul8x16 %%f19, %%f48, %%f22 \n\t" \
    "fmul8x16 %%f5, %%f44, %%f8   \n\t" \
    "fmul8x16 %%f21, %%f44, %%f24 \n\t" \
    "fmul8x16 %%f0, %%f42, %%f0   \n\t" \
    "fmul8x16 %%f16, %%f42, %%f16 \n\t" \
    "fmul8x16 %%f3, %%f50, %%f2   \n\t" \
    "fmul8x16 %%f19, %%f50, %%f18 \n\t" \
    "fmul8x16 %%f5, %%f46, %%f4   \n\t" \
    "fmul8x16 %%f21, %%f46, %%f20 \n\t" \
    \
    "fpsub16 %%f6, %%f34, %%f6   \n\t" /* 1 */ \
    "fpsub16 %%f22, %%f34, %%f22 \n\t" /* 1 */ \
    "fpsub16 %%f8, %%f38, %%f8   \n\t" /* 3 */ \
    "fpsub16 %%f24, %%f38, %%f24 \n\t" /* 3 */ \
    "fpsub16 %%f0, %%f32, %%f0   \n\t" /* 0 */ \
    "fpsub16 %%f16, %%f32, %%f16 \n\t" /* 0 */ \
    "fpsub16 %%f2, %%f36, %%f2   \n\t" /* 2 */ \
    "fpsub16 %%f18, %%f36, %%f18 \n\t" /* 2 */ \
    "fpsub16 %%f4, %%f40, %%f4   \n\t" /* 4 */ \
    "fpsub16 %%f20, %%f40, %%f20 \n\t" /* 4 */ \
    \
    "fpadd16 %%f0, %%f8, %%f8    \n\t" /* Gt */ \
    "fpadd16 %%f16, %%f24, %%f24 \n\t" /* Gt */ \
    "fpadd16 %%f0, %%f4, %%f4    \n\t" /* R */ \
    "fpadd16 %%f16, %%f20, %%f20 \n\t" /* R */ \
    "fpadd16 %%f0, %%f6, %%f6    \n\t" /* B */ \
    "fpadd16 %%f16, %%f22, %%f22 \n\t" /* B */ \
    "fpadd16 %%f8, %%f2, %%f2    \n\t" /* G */ \
    "fpadd16 %%f24, %%f18, %%f18 \n\t" /* G */ \
    \
    "fpack16 %%f4, %%f4    \n\t" \
    "fpack16 %%f20, %%f20  \n\t" \
    "fpack16 %%f6, %%f6    \n\t" \
    "fpack16 %%f22, %%f22  \n\t" \
    "fpack16 %%f2, %%f2    \n\t" \
    "fpack16 %%f18, %%f18  \n\t"



 /**
  * Convert a slice of planar YUV 4:2:0 to packed 32-bit pixels with SPARC VIS.
  *
  * One asm loop iteration consumes 8 luma bytes and 4 bytes from each chroma
  * plane and stores 32 output bytes; the loop counter starts at c->dstW and
  * is decremented by 8, so the width must be a multiple of 8 (checked in
  * sws_yuv2rgb_init_vis()). The stored byte order per pixel is A, R, G, B
  * with A = 0xff.
  *
  * Asm operands: %0 luma row, %1 / %2 chroma rows (row index halved for
  * 4:2:0), %3 output row, %4 remaining width, %5 c->sparc_coeffs.
  *
  * NOTE(review): the source rows are indexed with (y+srcSliceY); confirm
  * that callers pass plane pointers for the whole picture rather than for
  * the slice. The asm also declares no clobbers for the FP registers,
  * condition codes or memory it touches.
  *
  * @return srcSliceH
  */
 static int vis_420P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){
  /* out1..out6 only give the read-write asm operands somewhere to live */
  int y, out1, out2, out3, out4, out5, out6;

  for(y=0;y < srcSliceH;++y) {
      __asm__ volatile (
          YUV2RGB_INIT
          "wr %%g0, 0xd2, %%asi        \n\t" /* ASI_FL16_P */
          "1:                          \n\t"
          /* 16-bit loads: two chroma samples per register */
          "ldda [%1] %%asi, %%f2       \n\t"
          "ldda [%1+2] %%asi, %%f18    \n\t"
          "ldda [%2] %%asi, %%f4       \n\t"
          "ldda [%2+2] %%asi, %%f20    \n\t"
          "ld [%0], %%f0               \n\t"
          "ld [%0+4], %%f16            \n\t"
          /* merging a register with itself duplicates every chroma sample */
          "fpmerge %%f3, %%f3, %%f2    \n\t"
          "fpmerge %%f19, %%f19, %%f18 \n\t"
          "fpmerge %%f5, %%f5, %%f4    \n\t"
          "fpmerge %%f21, %%f21, %%f20 \n\t"
          YUV2RGB_KERNEL
          "fone %%f0                   \n\t"  // all ones: alpha = 0xff
          "fpmerge %%f4, %%f6, %%f8    \n\t"  // r,b,t1
          "fpmerge %%f20, %%f22, %%f24 \n\t"  // r,b,t1
          "fpmerge %%f0, %%f2, %%f10   \n\t"  // a,g,t2
          "fpmerge %%f0, %%f18, %%f26  \n\t"  // a,g,t2
          "fpmerge %%f10, %%f8, %%f4   \n\t"  // t2,t1,msb
          "fpmerge %%f26, %%f24, %%f20 \n\t"  // t2,t1,msb
          "fpmerge %%f11, %%f9, %%f6   \n\t"  // t2,t1,lsb
          "fpmerge %%f27, %%f25, %%f22 \n\t"  // t2,t1,lsb
          "std %%f4, [%3]              \n\t"
          "std %%f20, [%3+16]          \n\t"
          "std %%f6, [%3+8]            \n\t"
          "std %%f22, [%3+24]          \n\t"

          "add %0, 8, %0   \n\t"
          "add %1, 4, %1   \n\t"
          "add %2, 4, %2   \n\t"
          "subcc %4, 8, %4 \n\t"
          "bne 1b          \n\t"
          "add %3, 32, %3  \n\t" //delay slot
          : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
          : "0" (src[0]+(y+srcSliceY)*srcStride[0]), "1" (src[1]+((y+srcSliceY)>>1)*srcStride[1]),
            "2" (src[2]+((y+srcSliceY)>>1)*srcStride[2]), "3" (dst[0]+(y+srcSliceY)*dstStride[0]),
            "4" (c->dstW),
            "5" (c->sparc_coeffs)
      );
  }

  return srcSliceH;
 }

 /**
  * Convert a slice of planar YUV 4:2:2 to packed 32-bit pixels with SPARC VIS.
  *
  * Identical to vis_420P_ARGB32() except that the chroma rows are addressed
  * with the full row index instead of the halved one. The stored byte order
  * per pixel is A, R, G, B with A = 0xff; the width must be a multiple of 8
  * (checked in sws_yuv2rgb_init_vis()).
  *
  * Asm operands: %0 luma row, %1 / %2 chroma rows, %3 output row,
  * %4 remaining width, %5 c->sparc_coeffs.
  *
  * NOTE(review): the source rows are indexed with (y+srcSliceY); confirm
  * that callers pass plane pointers for the whole picture rather than for
  * the slice. The asm also declares no clobbers for the FP registers,
  * condition codes or memory it touches.
  *
  * @return srcSliceH
  */
 static int vis_422P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){
  /* out1..out6 only give the read-write asm operands somewhere to live */
  int y, out1, out2, out3, out4, out5, out6;

  for(y=0;y < srcSliceH;++y) {
      __asm__ volatile (
          YUV2RGB_INIT
          "wr %%g0, 0xd2, %%asi        \n\t" /* ASI_FL16_P */
          "1:                          \n\t"
          /* 16-bit loads: two chroma samples per register */
          "ldda [%1] %%asi, %%f2       \n\t"
          "ldda [%1+2] %%asi, %%f18    \n\t"
          "ldda [%2] %%asi, %%f4       \n\t"
          "ldda [%2+2] %%asi, %%f20    \n\t"
          "ld [%0], %%f0               \n\t"
          "ld [%0+4], %%f16            \n\t"
          /* merging a register with itself duplicates every chroma sample */
          "fpmerge %%f3, %%f3, %%f2    \n\t"
          "fpmerge %%f19, %%f19, %%f18 \n\t"
          "fpmerge %%f5, %%f5, %%f4    \n\t"
          "fpmerge %%f21, %%f21, %%f20 \n\t"
          YUV2RGB_KERNEL
          "fone %%f0                   \n\t"  // all ones: alpha = 0xff
          "fpmerge %%f4, %%f6, %%f8    \n\t"  // r,b,t1
          "fpmerge %%f20, %%f22, %%f24 \n\t"  // r,b,t1
          "fpmerge %%f0, %%f2, %%f10   \n\t"  // a,g,t2
          "fpmerge %%f0, %%f18, %%f26  \n\t"  // a,g,t2
          "fpmerge %%f10, %%f8, %%f4   \n\t"  // t2,t1,msb
          "fpmerge %%f26, %%f24, %%f20 \n\t"  // t2,t1,msb
          "fpmerge %%f11, %%f9, %%f6   \n\t"  // t2,t1,lsb
          "fpmerge %%f27, %%f25, %%f22 \n\t"  // t2,t1,lsb
          "std %%f4, [%3]              \n\t"
          "std %%f20, [%3+16]          \n\t"
          "std %%f6, [%3+8]            \n\t"
          "std %%f22, [%3+24]          \n\t"

          "add %0, 8, %0   \n\t"
          "add %1, 4, %1   \n\t"
          "add %2, 4, %2   \n\t"
          "subcc %4, 8, %4 \n\t"
          "bne 1b          \n\t"
          "add %3, 32, %3  \n\t" //delay slot
          : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
          : "0" (src[0]+(y+srcSliceY)*srcStride[0]), "1" (src[1]+(y+srcSliceY)*srcStride[1]),
            "2" (src[2]+(y+srcSliceY)*srcStride[2]), "3" (dst[0]+(y+srcSliceY)*dstStride[0]),
            "4" (c->dstW),
            "5" (c->sparc_coeffs)
      );
  }

  return srcSliceH;
 }

 /**
  * Fill c->sparc_coeffs for the VIS kernels and pick a VIS converter.
  *
  * Table layout (each entry is one 64-bit value loaded by YUV2RGB_INIT):
  *   [0]..[4]  (offset * coefficient >> 11), truncated to 16 bits and
  *             replicated into all four 16-bit lanes, for
  *             Y, U->B, U->G, V->G and V->R respectively;
  *   [5]..[9]  the yCoeff, vgCoeff, vrCoeff, ubCoeff and ugCoeff values.
  *
  * @return the VIS conversion function for YUV422P or YUV420P -> RGB32 when
  *         the destination width is a multiple of 8, NULL otherwise
  */
 SwsFunc sws_yuv2rgb_init_vis(SwsContext *c) {
    c->sparc_coeffs[5]=c->yCoeff;
    c->sparc_coeffs[6]=c->vgCoeff;
    c->sparc_coeffs[7]=c->vrCoeff;
    c->sparc_coeffs[8]=c->ubCoeff;
    c->sparc_coeffs[9]=c->ugCoeff;

    /* multiplying by 0x0001000100010001 copies the 16-bit value into every lane */
    c->sparc_coeffs[0]=(((int16_t)c->yOffset*(int16_t)c->yCoeff >>11) & 0xffff) * 0x0001000100010001ULL;
    c->sparc_coeffs[1]=(((int16_t)c->uOffset*(int16_t)c->ubCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
    c->sparc_coeffs[2]=(((int16_t)c->uOffset*(int16_t)c->ugCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
    c->sparc_coeffs[3]=(((int16_t)c->vOffset*(int16_t)c->vgCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
    c->sparc_coeffs[4]=(((int16_t)c->vOffset*(int16_t)c->vrCoeff>>11) & 0xffff) * 0x0001000100010001ULL;

    /* the kernels handle 8 pixels per iteration, hence the width restriction */
    if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV422P && (c->dstW & 7)==0) {
        av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV422P -> RGB32\n");
        return vis_422P_ARGB32;
    }
    else if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV420P && (c->dstW & 7)==0) {
        av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV420P -> RGB32\n");
        return vis_420P_ARGB32;
    }
    return NULL;
 }