spelling/grammar/wording overhaul

Originally committed as revision 27190 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale
17 years ago · 8a3227968c
--- a/libswscale/internal_bfin.S
+++ b/libswscale/internal_bfin.S
@@ -2,8 +2,8 @@
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *                    April 20, 2007
 *
 * Blackfin Video Color Space Converters Operations
 *  convert I420 YV12 to RGB in various formats,
 * Blackfin video color space converter operations
 * convert I420 YV12 to RGB in various formats
 *
 * This file is part of FFmpeg.
 *
@@ -24,8 +24,8 @@


 /*
 YUV420 to RGB565 conversion.  This routine takes a YUV 420 planar macroblock
 and converts it to RGB565.  R:5 bits, G:6 bits, B:5 bits.. packed into shorts
 YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
 and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits, packed into shorts.


 The following calculation is used for the conversion:
@@ -34,36 +34,36 @@ The following calculation is used for the conversion:
  g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
  b = clipz((y-oy)*cy  + cbu*(u-128))

 y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
 y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.


 New factorization to eliminate the truncation error which was
 occuring due to the byteop3p.
 occurring due to the byteop3p.


 1) use the bytop16m to subtract quad bytes we use this in U8 this
 1) Use the byteop16m to subtract quad bytes. We use this in U8, so the
 offsets then need to be renormalized to 8 bits.

 2) scale operands up by a factor of 4 not 8 because Blackfin
 2) Scale operands up by a factor of 4 not 8 because Blackfin
   multiplies include a shift.

 3) compute into the accumulators cy*yx0, cy*yx1
 3) Compute into the accumulators cy*yx0, cy*yx1.

 4) compute each of the linear equations
 4) Compute each of the linear equations:
     r = clipz((y - oy) * cy  + crv * (v - 128))

     g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))

     b = clipz((y - oy) * cy  + cbu * (u - 128))

   reuse of the accumulators requires that we actually multiply
   twice once with addition and the second time with a subtaction.
   Reuse of the accumulators requires that we actually multiply
   twice, once with an addition and the second time with a subtraction.

   because of this we need to compute the equations in the order R B
   Because of this we need to compute the equations in the order R B
   then G saving the writes for B in the case of 24/32 bit color
   formats.

   api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
   API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
                      int dW, uint32_t *coeffs);

       A          B
@@ -77,13 +77,13 @@ uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv;

 coeffs is a pointer to oy.

 the {rgb} masks are only utilized by the 565 packing algorithm. Note the data
 replication is used to simplify the internal algorithms for the dual mac architecture
 of BlackFin.
 The {rgb} masks are only utilized by the 565 packing algorithm. Note the data
 replication is used to simplify the internal algorithms for the dual MAC
 architecture of Blackfin.

 All routines are exported with _ff_bfin_ as a symbol prefix
 All routines are exported with _ff_bfin_ as a symbol prefix.

 rough performance gain compared against -O3:
 Rough performance gain compared against -O3:

 2779809/1484290 187.28%

--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -1,10 +1,10 @@
 /*
 *  rgb2rgb.c, Software RGB to RGB convertor
 *  pluralize by Software PAL8 to RGB convertor
 *               Software YUV to YUV convertor
 *               Software YUV to RGB convertor
 *  Written by Nick Kurshev.
 *  palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 *              software YUV to YUV converter
 *              software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 *
 * This file is part of FFmpeg.
 *
@@ -22,8 +22,8 @@
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * the C code (not assembly, mmx, ...) of this file can be used
 * under the LGPL license too
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
 #include <inttypes.h>
 #include "config.h"
@@ -33,7 +33,7 @@
 #include "swscale.h"
 #include "swscale_internal.h"

 #define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
 #define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients

 void (*rgb24to32)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size);
@@ -149,8 +149,8 @@ static uint64_t __attribute__((aligned(8))) dither8[2]={
 #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
 #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))

 //Note: we have C, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
 //Plain C versions
 //Note: We have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW + MMX2 one.
 //plain C versions
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_3DNOW
@@ -190,10 +190,10 @@ static uint64_t __attribute__((aligned(8))) dither8[2]={
 #endif //ARCH_X86 || ARCH_X86_64

 /*
 rgb15->rgb16 Original by Strepto/Astral
 RGB15->RGB16 original by Strepto/Astral
 ported to gcc & bugfixed : A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32bit c version, and and&add trick by Michael Niedermayer
 32-bit C version, and and&add trick by Michael Niedermayer
 */

 void sws_rgb2rgb_init(int flags){
@@ -266,7 +266,7 @@ void palette8torgb24(const uint8_t *src, uint8_t *dst, long num_pixels, const ui
 {
    long i;
 /*
    writes 1 byte o much and might cause alignment issues on some architectures?
    Writes 1 byte too much and might cause alignment issues on some architectures?
    for (i=0; i<num_pixels; i++)
        ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]];
 */
@@ -284,7 +284,7 @@ void palette8tobgr24(const uint8_t *src, uint8_t *dst, long num_pixels, const ui
 {
    long i;
 /*
    writes 1 byte o much and might cause alignment issues on some architectures?
    Writes 1 byte too much and might cause alignment issues on some architectures?
    for (i=0; i<num_pixels; i++)
        ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]];
 */
@@ -299,7 +299,7 @@ void palette8tobgr24(const uint8_t *src, uint8_t *dst, long num_pixels, const ui
 }

 /**
 * Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
 * Palette is assumed to contain BGR16, see rgb32to16 to convert the palette.
 */
 void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
 {
--- a/libswscale/rgb2rgb.h
+++ b/libswscale/rgb2rgb.h
@@ -1,8 +1,8 @@
 /*
 *  rgb2rgb.h, Software RGB to RGB convertor
 *  pluralize by Software PAL8 to RGB convertor
 *               Software YUV to YUV convertor
 *               Software YUV to RGB convertor
 *  software RGB to RGB converter
 *  pluralize by software PAL8 to RGB converter
 *               software YUV to YUV converter
 *               software YUV to RGB converter
 *  Written by Nick Kurshev.
 *  palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 *
@@ -28,7 +28,7 @@

 #include <inttypes.h>

 /* A full collection of rgb to rgb(bgr) convertors */
 /* A full collection of RGB to RGB(BGR) converters */
 extern void (*rgb24to32)   (const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb24to16)   (const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb24to15)   (const uint8_t *src, uint8_t *dst, long src_size);
@@ -71,53 +71,49 @@ extern void palette8torgb15(const uint8_t *src, uint8_t *dst, long num_pixels, c
 extern void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);

 /**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
 //void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,

 /**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
 extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                          long width, long height,
                          long lumStride, long chromStride, long dstStride);

 /**
 *
 * width should be a multiple of 16
 * Width should be a multiple of 16.
 */
 extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                             long width, long height,
                             long lumStride, long chromStride, long dstStride);

 /**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
 extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                          long width, long height,
                          long lumStride, long chromStride, long srcStride);

 /**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
 extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                          long width, long height,
                          long lumStride, long chromStride, long dstStride);

 /**
 *
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
 * problem for anyone then tell me, and ill fix it)
 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
 extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                           long width, long height,
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
@@ -1,11 +1,11 @@
 /*
 *  rgb2rgb.c, Software RGB to RGB convertor
 *  pluralize by Software PAL8 to RGB convertor
 *               Software YUV to YUV convertor
 *               Software YUV to RGB convertor
 *  Written by Nick Kurshev.
 *  palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 *  lot of big-endian byteorder fixes by Alex Beregszaszi
 * software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 *              software YUV to YUV converter
 *              software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lot of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of FFmpeg.
 *
@@ -23,7 +23,7 @@
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, mmx, ...) of this file can be used
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

@@ -229,10 +229,10 @@ static inline void RENAME(rgb32to24)(const uint8_t *src, uint8_t *dst, long src_
 }

 /*
 Original by Strepto/Astral
 ported to gcc & bugfixed : A'rpi
 original by Strepto/Astral
 ported to gcc & bugfixed: A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32 bit C version, and and&add trick by Michael Niedermayer
 32-bit C version, and and&add trick by Michael Niedermayer
 */
 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
 {
@@ -926,9 +926,9 @@ static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long s
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      Leftmost Bits Repeated to Fill Open Bits
       |      leftmost bits repeated to fill open bits
       |
   Original Bits
   original bits
 */
 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
 {
@@ -1006,7 +1006,7 @@ static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_
        :"=m"(*d)
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
        :"memory");
        /* Borrowed 32 to 24 */
        /* borrowed 32 to 24 */
        asm volatile(
        "movq       %%mm0, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
@@ -1147,7 +1147,7 @@ static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_
        :"=m"(*d)
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
        :"memory");
        /* Borrowed 32 to 24 */
        /* borrowed 32 to 24 */
        asm volatile(
        "movq       %%mm0, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
@@ -1479,7 +1479,7 @@ static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long s
    asm volatile(SFENCE:::"memory");
    asm volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finihsed, was multiple of 8
    if (mmx_size==23) return; //finished, was multiple of 8

    src+= src_size;
    dst+= src_size;
@@ -1638,8 +1638,8 @@ asm(    EMMS"       \n\t"
 }

 /**
 * Height should be a multiple of 2 and width should be a multiple of 16 (if
 * this is a problem for anyone then tell me, and I will fix it).
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
@@ -1720,7 +1720,7 @@ static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *u
                (vc[0] << 8) + (yc[1] << 0);
 #else
            *idst++ = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
               (vc[0] << 16) + (yc[1] << 24);
 #endif
            yc += 2;
            uc++;
@@ -1744,8 +1744,8 @@ asm(    EMMS"       \n\t"
 }

 /**
 * Height should be a multiple of 2 and width should be a multiple of 16 (if
 * this is a problem for anyone then tell me, and I will fix it).
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
@@ -1766,8 +1766,8 @@ static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usr
 }

 /**
 * Height should be a multiple of 2 and width should be a multiple of 16 (if
 * this is a problem for anyone then tell me, and I will fix it).
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
@@ -2002,9 +2002,9 @@ asm volatile(   EMMS"       \n\t"
 }

 /**
 * Height should be a multiple of 2 and width should be a multiple of 16 (if
 * this is a problem for anyone then tell me, and I will fix it).
 * Chrominance data is only taken from every secound line, others are ignored.
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
@@ -2128,9 +2128,9 @@ asm volatile(   EMMS"       \n\t"
 }

 /**
 * Height should be a multiple of 2 and width should be a multiple of 2 (if
 * this is a problem for anyone then tell me, and I will fix it).
 * Chrominance data is only taken from every secound line,
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line,
 * others are ignored in the C version.
 * FIXME: Write HQ version.
 */
--- a/libswscale/swscale_altivec_template.c
+++ b/libswscale/swscale_altivec_template.c
@@ -245,12 +245,12 @@ static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int
        src_v = vec_mergeh(src_v, (vector signed short)vzero);

        filter_v = vec_ld(i << 3, filter);
        // the 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2)
        // The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).

        // the neat trick : we only care for half the elements,
        // The neat trick: We only care for half the elements,
        // high or low depending on (i<<3)%16 (it's 0 or 8 here),
        // and we're going to use vec_mule, so we chose
        // carefully how to "unpack" the elements into the even slots
        // and we're going to use vec_mule, so we choose
        // carefully how to "unpack" the elements into the even slots.
        if ((i << 3) % 16)
            filter_v = vec_mergel(filter_v, (vector signed short)vzero);
        else
@@ -405,12 +405,12 @@ static inline int yv12toyuy2_unscaled_altivec(SwsContext *c, uint8_t* src[], int
        return srcSliceH;
    }

    /* this code assume:
    /* This code assumes:

    1) dst is 16 bytes-aligned
    2) dstStride is a multiple of 16
    3) width is a multiple of 16
    4) lum&chrom stride are multiple of 8
    4) lum & chrom stride are multiples of 8
    */

    for (y=0; y<height; y++) {
@@ -482,12 +482,12 @@ static inline int yv12touyvy_unscaled_altivec(SwsContext *c, uint8_t* src[], int
        return srcSliceH;
    }

    /* this code assume:
    /* This code assumes:

    1) dst is 16 bytes-aligned
    2) dstStride is a multiple of 16
    3) width is a multiple of 16
    4) lum&chrom stride are multiple of 8
    4) lum & chrom stride are multiples of 8
    */

    for (y=0; y<height; y++) {
--- a/libswscale/swscale_bfin.c
+++ b/libswscale/swscale_bfin.c
@@ -1,7 +1,7 @@
 /*
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * Blackfin Software Video SCALER Operations
 * Blackfin software video scaler operations
 *
 * This file is part of FFmpeg.
 *
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -37,7 +37,7 @@
 typedef int (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]);

 /* this struct should be aligned on at least 32-byte boundary */
 /* This struct should be aligned on at least a 32-byte boundary. */
 typedef struct SwsContext{
    /**
     * info on struct for av_log
@@ -73,7 +73,7 @@ typedef struct SwsContext{
    int16_t *vChrFilter;
    int16_t *vChrFilterPos;

    uint8_t formatConvBuffer[VOF]; //FIXME dynamic alloc, but we have to change a lot of code for this to be useful
    uint8_t formatConvBuffer[VOF]; //FIXME dynamic allocation, but we have to change a lot of code for this to be useful

    int hLumFilterSize;
    int hChrFilterSize;
@@ -122,7 +122,7 @@ typedef struct SwsContext{
 #define V_OFFSET              "10*8"
 #define LUM_MMX_FILTER_OFFSET "11*8"
 #define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
 #define DSTW_OFFSET           "11*8+4*4*256*2" //do not change, it is hardcoded in the asm
 #define DSTW_OFFSET           "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM
 #define ESP_OFFSET            "11*8+4*4*256*2+8"
 #define VROUNDER_OFFSET       "11*8+4*4*256*2+16"
 #define U_TEMP                "11*8+4*4*256*2+24"
--- a/libswscale/swscale_template.c
+++ b/libswscale/swscale_template.c
@@ -17,8 +17,8 @@
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * the C code (not assembly, mmx, ...) of this file can be used
 * under the LGPL license too
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

 #undef REAL_MOVNTQ
@@ -30,7 +30,7 @@
 #undef SFENCE

 #ifdef HAVE_3DNOW
 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
 #define EMMS     "femms"
 #else
 #define EMMS     "emms"
@@ -1503,7 +1503,7 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *
    const int yalpha1=0;
    int i;

    uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
    uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT)
@@ -1700,7 +1700,7 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *
    }
 }

 //FIXME yuy2* can read upto 7 samples to much
 //FIXME yuy2* can read up to 7 samples too many

 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
 {
@@ -2297,7 +2297,7 @@ static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1,
    }
 }

 // Bilinear / Bicubic scaling
 // bilinear / bicubic scaling
 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, long filterSize)
 {
@@ -2544,7 +2544,7 @@ static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, i
    }

 #ifdef HAVE_MMX
    // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
 #else
    if (!(flags&SWS_FAST_BILINEAR))
@@ -2552,7 +2552,7 @@ static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, i
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    else // fast bilinear upscale / crap downscale
    {
 #if defined(ARCH_X86)
 #ifdef HAVE_MMX2
@@ -2761,7 +2761,7 @@ inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1,
    }

 #ifdef HAVE_MMX
    // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
 #else
    if (!(flags&SWS_FAST_BILINEAR))
@@ -2770,7 +2770,7 @@ inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1,
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    else // fast bilinear upscale / crap downscale
    {
 #if defined(ARCH_X86)
 #ifdef HAVE_MMX2
@@ -2890,8 +2890,8 @@ FUNNY_UV_CODE
            "cmp        %2, %%"REG_a"               \n\t"
            " jb        1b                          \n\t"

 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC-4.0 */
 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC 4.0. */
 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
            :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
 #else
@@ -2963,7 +2963,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
    int lastDstY;
    uint8_t *pal=NULL;

    /* vars whch will change and which we need to storw back in the context */
    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
@@ -3004,13 +3004,14 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
        if (flags & SWS_PRINT_INFO && firstTime)
        {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory acesses anymore\n");
                   "         ->cannot do aligned memory accesses anymore\n");
            firstTime=0;
        }
    }

    /* Note the user might start scaling the picture in the middle so this will not get executed
       this is not really intended but works currently, so ppl might do it */
    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0){
        lumBufIndex=0;
        chrBufIndex=0;
@@ -3182,7 +3183,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
                {
                    int16_t *lumBuf = lumPixBuf[0];
                    int16_t *chrBuf= chrPixBuf[0];
@@ -3200,13 +3201,13 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
            {
                ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
                {
                    int chrAlpha= vChrFilter[2*dstY+1];
                    RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                        dest, dstW, chrAlpha, dstFormat, flags, dstY);
                }
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
                {
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
@@ -3217,7 +3218,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
                    RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                        dest, dstW, lumAlpha, chrAlpha, dstY);
                }
                else //General RGB
                else //general RGB
                {
                    RENAME(yuv2packedX)(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -39,7 +39,7 @@
 #include "swscale.h"
 #include "swscale_internal.h"

 #define DITHER1XBPP // only for mmx
 #define DITHER1XBPP // only for MMX

 const uint8_t  __attribute__((aligned(8))) dither_2x2_4[2][8]={
 {  1,   3,   1,   3,   1,   3,   1,   3, },
@@ -155,8 +155,8 @@ DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw)   = 0x00ff00ff00ff00ffULL;
 DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
 DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;

 // the volatile is required because gcc otherwise optimizes some writes away not knowing that these
 // are read in the asm block
 // The volatile is required because gcc otherwise optimizes some writes away
 // not knowing that these are read in the ASM block.
 static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither;
 static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither;
 static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither;
@@ -641,7 +641,7 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
    }
 #endif

    av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found\n");
    av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found.\n");

    switch(c->dstFormat){
    case PIX_FMT_BGR32:
--- a/libswscale/yuv2rgb_altivec.c
+++ b/libswscale/yuv2rgb_altivec.c
@@ -21,63 +21,68 @@
 */

 /*
 convert I420 YV12 to RGB in various formats,
  it rejects images that are not in 420 formats
  it rejects images that don't have widths of multiples of 16
  it rejects images that don't have heights of multiples of 2
 reject defers to C simulation codes.
 Convert I420 YV12 to RGB in various formats,
  it rejects images that are not in 420 formats,
  it rejects images that don't have widths of multiples of 16,
  it rejects images that don't have heights of multiples of 2.
 Reject defers to C simulation code.

 lots of optimizations to be done here
 Lots of optimizations to be done here.

 1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
   so we currently use max min to clip
 1. Need to fix saturation code. I just couldn't get it to fly with packs
   and adds, so we currently use max/min to clip.

 2. the inefficient use of chroma loading needs a bit of brushing up
 2. The inefficient use of chroma loading needs a bit of brushing up.

 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
 3. Analysis of pipeline stalls needs to be done. Use shark to identify
   pipeline stalls.


 MODIFIED to calculate coeffs from currently selected color space.
 MODIFIED core to be a macro which you spec the output format.
 ADDED UYVY conversion which is never called due to some thing in SWSCALE.
 MODIFIED core to be a macro where you specify the output format.
 ADDED UYVY conversion which is never called due to something in swscale.
 CORRECTED algorithm selection to be strict on input formats.
 ADDED runtime detection of altivec.
 ADDED runtime detection of AltiVec.

 ADDED altivec_yuv2packedX vertical scl + RGB converter

 March 27,2004
 PERFORMANCE ANALYSIS

 The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
 The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
 used as test.
 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
 same sequence.

 720*480*30  ~10MPS
 720 * 480 * 30  ~10MPS

 so we have roughly 10clocks per pixel this is too high something has to be wrong.
 So we have roughly 10 clocks per pixel. This is too high; something has
 to be wrong.

 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
 need for vec_min.

 OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
 guaranteed to have the input video frame it was just decompressed so
 it probably resides in L1 caches.  However we are creating the
 output video stream this needs to use the DSTST instruction to
 optimize for the cache.  We couple this with the fact that we are
 not going to be visiting the input buffer again so we mark it Least
 Recently Used.  This shaves 25% of the processor cycles off.
 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
 the input video frame, it was just decompressed so it probably resides in L1
 caches. However, we are creating the output video stream. This needs to use the
 DSTST instruction to optimize for the cache. We couple this with the fact that
 we are not going to be visiting the input buffer again so we mark it Least
 Recently Used. This shaves 25% of the processor cycles off.

 Now MEMCPY is the largest mips consumer in the system, probably due
 Now memcpy is the largest mips consumer in the system, probably due
 to the inefficient X11 stuff.

 GL libraries seem to be very slow on this machine 1.33Ghz PB running
 Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
 a versioning issues, however I have libGL.1.2.dylib for both
 machines. ((We need to figure this out now))
 a versioning issue, however I have libGL.1.2.dylib for both
 machines. (We need to figure this out now.)

 GL2 libraries work now with patch for RGB32
 GL2 libraries work now with patch for RGB32.

 NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.

 Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
 Integrated luma prescaling adjustment for saturation/contrast/brightness
 adjustment.
 */

 #include <stdio.h>
--- a/libswscale/yuv2rgb_bfin.c
+++ b/libswscale/yuv2rgb_bfin.c
@@ -1,9 +1,8 @@
 /*
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *                    April 20, 2007
 *
 * Blackfin Video Color Space Converters Operations
 *  convert I420 YV12 to RGB in various formats,
 * Blackfin video color space converter operations
 * convert I420 YV12 to RGB in various formats
 *
 * This file is part of FFmpeg.
 *
@@ -200,7 +199,7 @@ SwsFunc ff_bfin_yuv2rgb_get_func_ptr (SwsContext *c)
        return 0;
    }

    av_log(c, AV_LOG_INFO, "BlackFin Accelerated Color Space Converter %s\n",
    av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
           sws_format_name (c->dstFormat));

    return f;
--- a/libswscale/yuv2rgb_mlib.c
+++ b/libswscale/yuv2rgb_mlib.c
@@ -1,5 +1,6 @@
 /*
 * yuv2rgb_mlib.c, Software YUV to RGB converter using mediaLib
 * software YUV to RGB converter using mediaLib
 *
 * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
--- a/libswscale/yuv2rgb_template.c
+++ b/libswscale/yuv2rgb_template.c
@@ -1,5 +1,5 @@
 /*
 * yuv2rgb_mmx.c, Software YUV to RGB converter with Intel MMX "technology"
 * yuv2rgb_mmx.c, software YUV to RGB converter with Intel MMX "technology"
 *
 * Copyright (C) 2000, Silicon Integrated System Corp.
 *
@@ -31,7 +31,7 @@
 #undef SFENCE

 #ifdef HAVE_3DNOW
 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
 #define EMMS     "femms"
 #else
 #define EMMS     "emms"
@@ -147,8 +147,8 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStr
        g6Dither= ff_dither4[y&1];
        g5Dither= ff_dither8[y&1];
        r5Dither= ff_dither8[(y+1)&1];
        /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
           pixels in each iteration */
        /* This MMX assembly code deals with a SINGLE scan line at a time,
         * it converts 8 pixels in each iteration. */
        asm volatile (
        /* load data for start of next scan line */
        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
@@ -156,8 +156,8 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStr
        "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
        //".balign 16     \n\t"
        "1:             \n\t"
        /* no speed diference on my p3@500 with prefetch,
         * if it is faster for anyone with -benchmark then tell me
        /* No speed difference on my p3@500 with prefetch,
         * if it is faster for anyone with -benchmark then tell me.
        PREFETCH" 64(%0) \n\t"
        PREFETCH" 64(%1) \n\t"
        PREFETCH" 64(%2) \n\t"
@@ -180,7 +180,7 @@ YUV2RGB
        "movq %%mm0, %%mm5;" /* Copy B7-B0 */
        "movq %%mm2, %%mm7;" /* Copy G7-G0 */

        /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
        /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
        "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
        "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

@@ -190,7 +190,7 @@ YUV2RGB
        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
        MOVNTQ "      %%mm0, (%1);" /* store pixel 0-3 */

        /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
        /* convert RGB24 plane to RGB16 pack for pixel 4-7 */
        "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
        "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

@@ -242,8 +242,8 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStr
        g6Dither= ff_dither4[y&1];
        g5Dither= ff_dither8[y&1];
        r5Dither= ff_dither8[(y+1)&1];
        /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
           pixels in each iteration */
        /* This MMX assembly code deals with a SINGLE scan line at a time,
         * it converts 8 pixels in each iteration. */
        asm volatile (
        /* load data for start of next scan line */
        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
@@ -271,7 +271,7 @@ YUV2RGB
        "movq %%mm0, %%mm5;" /* Copy B7-B0 */
        "movq %%mm2, %%mm7;" /* Copy G7-G0 */

        /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
        /* convert RGB24 plane to RGB15 pack for pixel 0-3 */
        "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
        "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

@@ -281,7 +281,7 @@ YUV2RGB
        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
        MOVNTQ "      %%mm0, (%1);"  /* store pixel 0-3 */

        /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
        /* convert RGB24 plane to RGB15 pack for pixel 4-7 */
        "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */
        "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

@@ -326,8 +326,8 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStr
        uint8_t *pv = src[2] + (y>>1)*srcStride[2];
        long index= -h_size/2;

        /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
           pixels in each iteration */
        /* This MMX assembly code deals with a SINGLE scan line at a time,
         * it converts 8 pixels in each iteration. */
        asm volatile (
        /* load data for start of next scan line */
        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
@@ -472,8 +472,8 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStr
        uint8_t *pv = src[2] + (y>>1)*srcStride[2];
        long index= -h_size/2;

        /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
           pixels in each iteration */
        /* This MMX assembly code deals with a SINGLE scan line at a time,
         * it converts 8 pixels in each iteration. */
        asm volatile (
        /* load data for start of next scan line */
        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */