yuv2rgb brightness/contrast/saturation/different colorspaces support finished

yuv2rgb deglobalize yuv2rgb optimizations / cleanup bugs? Originally committed as revision 9477 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
23 years ago · 5427e24291
--- a/postproc/rgb2rgb.h
+++ b/postproc/rgb2rgb.h
@@ -84,13 +84,15 @@ extern void yvu9_to_yuy2(const uint8_t *src1, const uint8_t *src2, const uint8_t
 #define MODE_RGB  0x1
 #define MODE_BGR  0x2

 typedef void (* yuv2rgb_fun) (uint8_t * image, uint8_t * py,
 static void yuv2rgb(uint8_t * image, uint8_t * py,
 			      uint8_t * pu, uint8_t * pv,
 			      unsigned h_size, unsigned v_size,
 			      int rgb_stride, int y_stride, int uv_stride);
 			      int rgb_stride, int y_stride, int uv_stride){
 printf("broken, this should use the swscaler\n");
 }

 extern yuv2rgb_fun yuv2rgb;

 void yuv2rgb_init (unsigned bpp, int mode);
 /* Stub: both arguments are ignored, the only effect is a notice on stdout
    saying that this entry point is broken and the swscaler should be used. */
 static void yuv2rgb_init (unsigned bpp, int mode){
 	(void)bpp;
 	(void)mode;
 	printf("broken, this should use the swscaler\n");
 }

 #endif
--- a/postproc/swscale.c
+++ b/postproc/swscale.c
@@ -1,5 +1,5 @@
 /*
    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -62,6 +62,7 @@ untested special converters
 #include <stdlib.h>
 #endif
 #include "swscale.h"
 #include "swscale_internal.h"
 #include "../cpudetect.h"
 #include "../bswap.h"
 #include "../libvo/img_format.h"
@@ -147,7 +148,6 @@ add support for Y8 output
 optimize bgr24 & bgr32
 add BGR4 output support
 write special BGR->BGR scaler
 deglobalize yuv2rgb*.c
 */

 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
@@ -230,8 +230,6 @@ void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSli
             int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;

 static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
 static inline void orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]);
 void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_gU[256], int table_gV[256], void *table_bU[256]);

 extern const uint8_t dither_2x2_4[2][8];
 extern const uint8_t dither_2x2_8[2][8];
@@ -1634,18 +1632,6 @@ static void PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[],
 		interleaveBytes( src[2],src[1],dst,c->srcW,srcSliceH,srcStride[2],srcStride[1],dstStride[0] );
 }


 /* Wrapper functions for yuv2bgr */
 /**
 * Hands one slice of planar YUV to yuv2rgb().
 * The output pointer is dstParam[0] advanced by srcSliceY rows. For YV12 the
 * chroma planes are passed as src[1], src[2]; for every other format
 * (I420 & IYUV) they are passed swapped. srcStride[1] is used as the stride
 * of both chroma planes.
 */
 static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStride[]){
 	const int isYV12= (c->srcFormat==IMGFMT_YV12);
 	uint8_t *firstChroma = isYV12 ? src[1] : src[2];
 	uint8_t *secondChroma= isYV12 ? src[2] : src[1];
 	uint8_t *out= dstParam[0] + dstStride[0]*srcSliceY;

 	yuv2rgb( out,src[0],firstChroma,secondChroma,c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
 }

 static void PlanarToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStride[]){
 	uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
@@ -1773,7 +1759,7 @@ static void yvu9toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], in
 /**
 * bring pointers in YUV order instead of YVU
 */
 static inline void orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]){
 inline void sws_orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]){
 	if(format == IMGFMT_YV12 || format == IMGFMT_YVU9 
           || format == IMGFMT_444P || format == IMGFMT_422P || format == IMGFMT_411P){
 		sortedP[0]= p[0];
@@ -1814,8 +1800,8 @@ static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[],
 	uint8_t *src[3];
 	uint8_t *dst[3];

 	orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
 	orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
 	sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
 	sws_orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);

 	if(isPacked(c->srcFormat))
 	{
@@ -1923,41 +1909,51 @@ static void getSubSampleFactors(int *h, int *v, int format){
 	}
 }

 static uint16_t roundToInt16(float f){
 	     if(f<-0x7FFF) f= -0x7FFF;
 	else if(f> 0x7FFF) f=  0x7FFF;
 	
 	return (int)floor(f + 0.5);
 /**
 * Converts a value scaled by 1<<16 to a 16 bit integer, rounding to nearest
 * (half is added before the shift) and saturating.
 * @param f the value to convert, scaled by 1<<16
 * @return the rounded value as the low 16 bits of its two's complement form;
 *         0x7FFF if the result would exceed 0x7FFF, 0x8000 if it would be
 *         below -0x7FFF
 */
 static uint16_t roundToInt16(int64_t f){
 	int64_t r;

 	/* every input from 0x7FFF<<16 upwards ends at 0x7FFF anyway; returning
 	   here also keeps the rounding addition below from overflowing */
 	if(f >= ((int64_t)0x7FFF<<16)) return 0x7FFF;

 	/* kept 64 bit wide: narrowing to int before the range check would let
 	   very large magnitudes wrap around and slip past the clamp */
 	r= (f + (1<<15))>>16;
 	if(r < -0x7FFF) return 0x8000;

 	return (uint16_t)r;
 }

 /**
 * @param colorspace colorspace
 * @param inv_table the yuv2rgb coeffs, normally Inverse_Table_6_9[x]
 * @param fullRange if 1 then the luma range is 0..255, if 0 it is 16..235
 * @return -1 if not supported
 */
 void setInputColorspaceDetails(SwsContext *c, int colorspace, int fullRange, float brightness, float contrast, float saturation){

 	float crv =  Inverse_Table_6_9[colorspace][0]/65536.0;
 	float cbu =  Inverse_Table_6_9[colorspace][1]/65536.0;
 	float cgu = -Inverse_Table_6_9[colorspace][2]/65536.0;
 	float cgv = -Inverse_Table_6_9[colorspace][3]/65536.0;
 	float cy  = 1.0;
 	float oy  = 0;
 int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation){
 	int64_t crv =  inv_table[0];
 	int64_t cbu =  inv_table[1];
 	int64_t cgu = -inv_table[2];
 	int64_t cgv = -inv_table[3];
 	int64_t cy  = 1<<16;
 	int64_t oy  = 0;

 	if(isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1;
 	memcpy(c->srcColorspaceTable, inv_table, sizeof(int)*4);
 	memcpy(c->dstColorspaceTable,     table, sizeof(int)*4);

 	c->brightness= brightness;
 	c->contrast  = contrast;
 	c->saturation= saturation;
 	c->srcRange  = srcRange;
 	c->dstRange  = dstRange;

 	c->uOffset=   0x0400040004000400LL;
 	c->vOffset=   0x0400040004000400LL;

 	if(!fullRange){
 		cy= (cy*255.0) / 219.0;
 		oy= 16.0;
 	if(!srcRange){
 		cy= (cy*255) / 219;
 		oy= 16<<16;
 	}

 	cy *= contrast;
 	crv*= contrast * saturation;
 	cbu*= contrast * saturation;
 	cgu*= contrast * saturation;
 	cgv*= contrast * saturation;
 	cy = (cy *contrast             )>>16;
 	crv= (crv*contrast * saturation)>>32;
 	cbu= (cbu*contrast * saturation)>>32;
 	cgu= (cgu*contrast * saturation)>>32;
 	cgv= (cgv*contrast * saturation)>>32;

 	oy -= 256.0*brightness;
 	oy -= 256*brightness;

 	c->yCoeff=    roundToInt16(cy *8192) * 0x0001000100010001ULL;
 	c->vrCoeff=   roundToInt16(crv*8192) * 0x0001000100010001ULL;
@@ -1965,6 +1961,28 @@ void setInputColorspaceDetails(SwsContext *c, int colorspace, int fullRange, flo
 	c->vgCoeff=   roundToInt16(cgv*8192) * 0x0001000100010001ULL;
 	c->ugCoeff=   roundToInt16(cgu*8192) * 0x0001000100010001ULL;
 	c->yOffset=   roundToInt16(oy *   8) * 0x0001000100010001ULL;

 	yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, contrast, saturation);
 	//FIXME factorize
 	
 	return 0;
 }

 /**
 * Reads back the colorspace settings stored in the context.
 * inv_table and table are set to point at the context's own
 * srcColorspaceTable / dstColorspaceTable arrays (nothing is copied); the
 * remaining outputs receive the context fields of the same name.
 * @return 0 on success, -1 if not supported (destination format is YUV or
 *         gray); no output is written in that case
 */
 int sws_getColorspaceDetails(SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation){
 	const int dstFormat= c->dstFormat;

 	if(isYUV(dstFormat) || isGray(dstFormat))
 		return -1;

 	/* coefficient tables: pointers into the context */
 	*inv_table = &c->srcColorspaceTable[0];
 	*table     = &c->dstColorspaceTable[0];

 	/* range flags */
 	*srcRange  = c->srcRange;
 	*dstRange  = c->dstRange;

 	/* picture adjustments */
 	*brightness= c->brightness;
 	*contrast  = c->contrast;
 	*saturation= c->saturation;

 	return 0;
 }

 SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
@@ -2026,8 +2044,6 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 	c->dstFormat= dstFormat;
 	c->srcFormat= srcFormat;

 	setInputColorspaceDetails(c, SWS_CS_DEFAULT, 0, 0.0, 1.0, 1.0);
 	
 	usesFilter=0;
 	if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
 	if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
@@ -2054,17 +2070,14 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,

 	c->chrIntHSubSample= c->chrDstHSubSample;
 	c->chrIntVSubSample= c->chrSrcVSubSample;
 	

 	// note the -((-x)>>y) is so that we always round toward +inf
 	c->chrSrcW= -((-srcW) >> c->chrSrcHSubSample);
 	c->chrSrcH= -((-srcH) >> c->chrSrcVSubSample);
 	c->chrDstW= -((-dstW) >> c->chrDstHSubSample);
 	c->chrDstH= -((-dstH) >> c->chrDstVSubSample);
 	
 	if(isBGR(dstFormat))
 		c->yuvTable= yuv2rgb_c_init(dstFormat & 0xFF, MODE_RGB, c->table_rV, c->table_gU, c->table_gV, c->table_bU);
 	if(isRGB(dstFormat))
 		c->yuvTable= yuv2rgb_c_init(dstFormat & 0xFF, MODE_BGR, c->table_rV, c->table_gU, c->table_gV, c->table_bU);

 	sws_setColorspaceDetails(c, Inverse_Table_6_9[SWS_CS_DEFAULT], 0, Inverse_Table_6_9[SWS_CS_DEFAULT] /* FIXME*/, 0, 0, 1<<16, 1<<16); 

 	/* unscaled special Cases */
 	if(unscaled && !usesFilter)
@@ -2075,19 +2088,9 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 			c->swScale= PlanarToNV12Wrapper;
 		}
 		/* yuv2bgr */
 		if((srcFormat==IMGFMT_YV12 || srcFormat==IMGFMT_I420) && isBGR(dstFormat))
 		if((srcFormat==IMGFMT_YV12 || srcFormat==IMGFMT_I420 || srcFormat==IMGFMT_422P) && (isBGR(dstFormat) || isRGB(dstFormat)))
 		{
 			// FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
 			//FIXME rgb vs. bgr ? 
 #ifdef WORDS_BIGENDIAN
 			if(dstFormat==IMGFMT_BGR32)
 				yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_BGR);
 			else
 				yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
 #else
 			yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
 #endif
 			c->swScale= planarYuvToBgr;
 			c->swScale= yuv2rgb_get_func_ptr(c);
 		}
 		
 		if( srcFormat==IMGFMT_YVU9 && (dstFormat==IMGFMT_YV12 || dstFormat==IMGFMT_I420) )
--- a/postproc/swscale.h
+++ b/postproc/swscale.h
@@ -1,5 +1,5 @@
 /*
    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -16,6 +16,11 @@
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

 #ifndef SWSCALE_H
 #define SWSCALE_H

 #include "swscale_internal.h" //FIXME HACK REMOVE

 /* values for the flags, the stuff on the command line is different */
 #define SWS_FAST_BILINEAR 1
 #define SWS_BILINEAR 2
@@ -44,8 +49,6 @@
 #define SWS_FULL_CHR_H_INP	0x4000
 #define SWS_DIRECT_BGR		0x8000

 #define MAX_FILTER_SIZE 256

 #define SWS_MAX_REDUCE_CUTOFF 0.002

 #define SWS_CS_ITU709		1
@@ -56,97 +59,6 @@
 #define SWS_CS_SMPTE240M 	7
 #define SWS_CS_DEFAULT 		5

 /* this struct should be aligned on at least 32-byte boundary */
 typedef struct SwsContext{
 	int srcW, srcH, dstH;
 	/* chroma plane sizes; getSwsContext() derives them from the luma sizes
 	   shifted by the matching *SubSample value, rounded up */
 	int chrSrcW, chrSrcH, chrDstW, chrDstH;
 	int lumXInc, chrXInc;
 	int lumYInc, chrYInc;
 	int dstFormat, srcFormat;
 	int chrSrcHSubSample, chrSrcVSubSample;
 	int chrIntHSubSample, chrIntVSubSample;
 	int chrDstHSubSample, chrDstVSubSample;
 	int vChrDrop;

 	int16_t **lumPixBuf;
 	int16_t **chrPixBuf;
 	int16_t *hLumFilter;
 	int16_t *hLumFilterPos;
 	int16_t *hChrFilter;
 	int16_t *hChrFilterPos;
 	int16_t *vLumFilter;
 	int16_t *vLumFilterPos;
 	int16_t *vChrFilter;
 	int16_t *vChrFilterPos;

 	uint8_t formatConvBuffer[4000]; //FIXME dynamic alloc, but we have to change a lot of code for this to be useful

 	int hLumFilterSize;
 	int hChrFilterSize;
 	int vLumFilterSize;
 	int vChrFilterSize;
 	int vLumBufSize;
 	int vChrBufSize;

 	uint8_t __attribute__((aligned(32))) funnyYCode[10000];
 	uint8_t __attribute__((aligned(32))) funnyUVCode[10000];
 	int32_t *lumMmx2FilterPos;
 	int32_t *chrMmx2FilterPos;
 	int16_t *lumMmx2Filter;
 	int16_t *chrMmx2Filter;

 	int canMMX2BeUsed;

 	int lastInLumBuf;
 	int lastInChrBuf;
 	int lumBufIndex;
 	int chrBufIndex;
 	int dstY;
 	int flags;
 	/* yuvTable receives the return value of yuv2rgb_c_init(); the four
 	   table_* arrays are passed to that call (see getSwsContext()) */
 	void * yuvTable;
 	void * table_rV[256];
 	void * table_gU[256];
 	int    table_gV[256];
 	void * table_bU[256];

 	/* conversion entry point, assigned in getSwsContext() */
 	void (*swScale)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]);

 /* Byte offsets, written as strings, of the members declared below, counted
    from redDither: eleven uint64_t slots (0*8 .. 10*8), then lumMmxFilter and
    chrMmxFilter (4*MAX_FILTER_SIZE int32_t each, i.e. 4*4*256 bytes with
    MAX_FILTER_SIZE == 256), then dstW and esp. The order and sizes of those
    members and the literal 256 must stay in sync with these strings.
    NOTE(review): presumably pasted into the MMX inline asm - confirm. */
 #define RED_DITHER   "0*8"
 #define GREEN_DITHER "1*8"
 #define BLUE_DITHER  "2*8"
 #define Y_COEFF      "3*8"
 #define VR_COEFF     "4*8"
 #define UB_COEFF     "5*8"
 #define VG_COEFF     "6*8"
 #define UG_COEFF     "7*8"
 #define Y_OFFSET     "8*8"
 #define U_OFFSET     "9*8"
 #define V_OFFSET     "10*8"
 #define LUM_MMX_FILTER_OFFSET "11*8"
 #define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
 #define DSTW_OFFSET  "11*8+4*4*256*2"
 #define ESP_OFFSET  "11*8+4*4*256*2+4"
                  
 	uint64_t redDither   __attribute__((aligned(8)));
 	uint64_t greenDither __attribute__((aligned(8)));
 	uint64_t blueDither  __attribute__((aligned(8)));

 	/* each coefficient / offset is one 16 bit value replicated into all four
 	   16 bit lanes (multiplied by 0x0001000100010001 when it is set) */
 	uint64_t yCoeff      __attribute__((aligned(8)));
 	uint64_t vrCoeff     __attribute__((aligned(8)));
 	uint64_t ubCoeff     __attribute__((aligned(8)));
 	uint64_t vgCoeff     __attribute__((aligned(8)));
 	uint64_t ugCoeff     __attribute__((aligned(8)));
 	uint64_t yOffset     __attribute__((aligned(8)));
 	uint64_t uOffset     __attribute__((aligned(8)));
 	uint64_t vOffset     __attribute__((aligned(8)));
 	int32_t  lumMmxFilter[4*MAX_FILTER_SIZE];
 	int32_t  chrMmxFilter[4*MAX_FILTER_SIZE];
 	int dstW;
 	int esp;
 } SwsContext;
 //FIXME check init (where 0)
 //FIXME split private & public


 // when used for filters they must have an odd number of elements
@@ -185,6 +97,9 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 			 SwsFilter *srcFilter, SwsFilter *dstFilter);
 void swsGetFlagsAndFilterFromCmdLine(int *flags, SwsFilter **srcFilterParam, SwsFilter **dstFilterParam);

 int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation);
 int sws_getColorspaceDetails(SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation);

 SwsVector *getGaussianVec(double variance, double quality);
 SwsVector *getConstVec(double c, int length);
 SwsVector *getIdentityVec(void);
@@ -199,3 +114,4 @@ SwsVector *cloneVec(SwsVector *a);
 void printVec(SwsVector *a);
 void freeVec(SwsVector *a);

 #endif
--- a/postproc/swscale_internal.h
+++ b/postproc/swscale_internal.h
@@ -0,0 +1,130 @@
 /*
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

 #ifndef SWSCALE_INTERNAL_H
 #define SWSCALE_INTERNAL_H

 #define MAX_FILTER_SIZE 256

 struct SwsContext;

 typedef void (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]);

 /* this struct should be aligned on at least 32-byte boundary */
 typedef struct SwsContext{
 	int srcW, srcH, dstH;
 	/* chroma plane sizes; getSwsContext() derives them from the luma sizes
 	   shifted by the matching *SubSample value, rounded up */
 	int chrSrcW, chrSrcH, chrDstW, chrDstH;
 	int lumXInc, chrXInc;
 	int lumYInc, chrYInc;
 	int dstFormat, srcFormat;
 	int chrSrcHSubSample, chrSrcVSubSample;
 	int chrIntHSubSample, chrIntVSubSample;
 	int chrDstHSubSample, chrDstVSubSample;
 	int vChrDrop;

 	int16_t **lumPixBuf;
 	int16_t **chrPixBuf;
 	int16_t *hLumFilter;
 	int16_t *hLumFilterPos;
 	int16_t *hChrFilter;
 	int16_t *hChrFilterPos;
 	int16_t *vLumFilter;
 	int16_t *vLumFilterPos;
 	int16_t *vChrFilter;
 	int16_t *vChrFilterPos;

 	uint8_t formatConvBuffer[4000]; //FIXME dynamic alloc, but we have to change a lot of code for this to be useful

 	int hLumFilterSize;
 	int hChrFilterSize;
 	int vLumFilterSize;
 	int vChrFilterSize;
 	int vLumBufSize;
 	int vChrBufSize;

 	uint8_t __attribute__((aligned(32))) funnyYCode[10000];
 	uint8_t __attribute__((aligned(32))) funnyUVCode[10000];
 	int32_t *lumMmx2FilterPos;
 	int32_t *chrMmx2FilterPos;
 	int16_t *lumMmx2Filter;
 	int16_t *chrMmx2Filter;

 	int canMMX2BeUsed;

 	int lastInLumBuf;
 	int lastInChrBuf;
 	int lumBufIndex;
 	int chrBufIndex;
 	int dstY;
 	int flags;
 	void * yuvTable;			// pointer to the yuv->rgb table start so it can be freed()
 	/* per-sample lookup used by the RGB() macro in yuv2rgb.c: table_rV and
 	   table_gV are indexed by the V sample, table_gU and table_bU by the U
 	   sample; table_gV holds int offsets that are added to the table_gU
 	   pointer */
 	void * table_rV[256];
 	void * table_gU[256];
 	int    table_gV[256];
 	void * table_bU[256];

 	//Colorspace stuff
 	/* written by sws_setColorspaceDetails(), read back unchanged by
 	   sws_getColorspaceDetails() */
 	int contrast, brightness, saturation;	// for sws_getColorspaceDetails
 	int srcColorspaceTable[4];
 	int dstColorspaceTable[4];
 	int srcRange, dstRange;

 	/* conversion entry point, assigned in getSwsContext() */
 	SwsFunc swScale;

 /* Byte offsets, written as strings, of the members declared below, counted
    from redDither: eleven uint64_t slots (0*8 .. 10*8), then lumMmxFilter and
    chrMmxFilter (4*MAX_FILTER_SIZE int32_t each, i.e. 4*4*256 bytes with
    MAX_FILTER_SIZE == 256), then dstW and esp. The order and sizes of those
    members and the literal 256 must stay in sync with these strings.
    NOTE(review): presumably pasted into the MMX inline asm - confirm. */
 #define RED_DITHER   "0*8"
 #define GREEN_DITHER "1*8"
 #define BLUE_DITHER  "2*8"
 #define Y_COEFF      "3*8"
 #define VR_COEFF     "4*8"
 #define UB_COEFF     "5*8"
 #define VG_COEFF     "6*8"
 #define UG_COEFF     "7*8"
 #define Y_OFFSET     "8*8"
 #define U_OFFSET     "9*8"
 #define V_OFFSET     "10*8"
 #define LUM_MMX_FILTER_OFFSET "11*8"
 #define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
 #define DSTW_OFFSET  "11*8+4*4*256*2"
 #define ESP_OFFSET  "11*8+4*4*256*2+4"
                  
 	uint64_t redDither   __attribute__((aligned(8)));
 	uint64_t greenDither __attribute__((aligned(8)));
 	uint64_t blueDither  __attribute__((aligned(8)));

 	/* each coefficient / offset is one 16 bit value replicated into all four
 	   16 bit lanes (multiplied by 0x0001000100010001 in
 	   sws_setColorspaceDetails()) */
 	uint64_t yCoeff      __attribute__((aligned(8)));
 	uint64_t vrCoeff     __attribute__((aligned(8)));
 	uint64_t ubCoeff     __attribute__((aligned(8)));
 	uint64_t vgCoeff     __attribute__((aligned(8)));
 	uint64_t ugCoeff     __attribute__((aligned(8)));
 	uint64_t yOffset     __attribute__((aligned(8)));
 	uint64_t uOffset     __attribute__((aligned(8)));
 	uint64_t vOffset     __attribute__((aligned(8)));
 	int32_t  lumMmxFilter[4*MAX_FILTER_SIZE];
 	int32_t  chrMmxFilter[4*MAX_FILTER_SIZE];
 	int dstW;
 	int esp;
 } SwsContext;
 //FIXME check init (where 0)
 //FIXME split private & public

 inline void sws_orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]);
 SwsFunc yuv2rgb_get_func_ptr (SwsContext *c);
 int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);

 #endif
--- a/postproc/swscale_template.c
+++ b/postproc/swscale_template.c
@@ -1,5 +1,5 @@
 /*
    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -741,7 +741,6 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t *
 				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
 {
 	int dummy=0;
 #ifdef HAVE_MMX
 	if(uDest != NULL)
 	{
@@ -2553,8 +2552,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
 	uint8_t *src[3];
 	uint8_t *dst[3];
 	
 	orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
 	orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
 	sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
 	sws_orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);

 	if(isPacked(c->srcFormat)){
 		src[0]=
--- a/postproc/yuv2rgb.c
+++ b/postproc/yuv2rgb.c
@@ -27,18 +27,23 @@
 *
 * MMX/MMX2 Template stuff from Michael Niedermayer (michaelni@gmx.at) (needed for fast movntq support)
 * 1,4,8bpp support by Michael Niedermayer (michaelni@gmx.at)
 * context / deglobalize stuff by Michael Niedermayer
 */

 #include <stdio.h>
 #include <stdlib.h>
 #include <inttypes.h>
 #include <assert.h>

 #include "config.h"
 //#include "video_out.h"
 #include "rgb2rgb.h"
 #include "swscale.h"
 #include "swscale_internal.h"
 #include "../cpudetect.h"
 #include "../mangle.h"
 #include "../mp_msg.h"
 #include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff

 #ifdef HAVE_MLIB
 #include "yuv2rgb_mlib.c"
@@ -46,10 +51,6 @@

 #define DITHER1XBPP // only for mmx

 #ifdef ARCH_X86
 #define CAN_COMPILE_X86_ASM
 #endif

 const uint8_t  __attribute__((aligned(8))) dither_2x2_4[2][8]={
 {  1,   3,   1,   3,   1,   3,   1,   3, },
 {  2,   0,   2,   0,   2,   0,   2,   0, },
@@ -157,21 +158,10 @@ const uint8_t  __attribute__((aligned(8))) dither_8x8_220[8][8]={
 };
 #endif

 #ifdef CAN_COMPILE_X86_ASM
 #ifdef ARCH_X86

 /* hope these constant values are cache line aligned */
 uint64_t __attribute__((aligned(8))) mmx_80w = 0x0080008000800080;
 uint64_t __attribute__((aligned(8))) mmx_10w = 0x1010101010101010;
 uint64_t __attribute__((aligned(8))) mmx_00ffw = 0x00ff00ff00ff00ff;
 uint64_t __attribute__((aligned(8))) mmx_Y_coeff = 0x253f253f253f253f;

 /* hope these constant values are cache line aligned */
 uint64_t __attribute__((aligned(8))) mmx_U_green = 0xf37df37df37df37d;
 uint64_t __attribute__((aligned(8))) mmx_U_blue = 0x4093409340934093;
 uint64_t __attribute__((aligned(8))) mmx_V_red = 0x3312331233123312;
 uint64_t __attribute__((aligned(8))) mmx_V_green = 0xe5fce5fce5fce5fc;

 /* hope these constant values are cache line aligned */
 uint64_t __attribute__((aligned(8))) mmx_redmask = 0xf8f8f8f8f8f8f8f8;
 uint64_t __attribute__((aligned(8))) mmx_grnmask = 0xfcfcfcfcfcfcfcfc;

@@ -217,8 +207,6 @@ uint64_t __attribute__((aligned(8))) dither8[2]={

 #endif // CAN_COMPILE_X86_ASM

 uint32_t matrix_coefficients = 6;

 const int32_t Inverse_Table_6_9[8][4] = {
    {117504, 138453, 13954, 34903}, /* no sequence_display_extension */
    {117504, 138453, 13954, 34903}, /* ITU-R Rec. 709 (1990) */
@@ -230,82 +218,12 @@ const int32_t Inverse_Table_6_9[8][4] = {
    {117579, 136230, 16907, 35559}  /* SMPTE 240M (1987) */
 };

 void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_gU[256], int table_gV[256], void *table_bU[256]);

 yuv2rgb_fun yuv2rgb= NULL;

 static void (* yuv2rgb_c_internal) (uint8_t *, uint8_t *,
 				    uint8_t *, uint8_t *,
 				    void *, void *, int, int);

 /**
 * Generic C driver: walks the picture two luma rows at a time and lets the
 * converter selected in yuv2rgb_c_internal handle each row pair.
 * Only v_size/2 row pairs are processed, so a trailing odd row is not
 * converted. The last argument given to the converter is twice the number of
 * row pairs still to come, i.e. it counts down to 0.
 */
 static void yuv2rgb_c (void * dst, uint8_t * py,
 		       uint8_t * pu, uint8_t * pv,
 		       unsigned h_size, unsigned v_size,
 		       unsigned rgb_stride, unsigned y_stride, unsigned uv_stride)
 {
    uint8_t *out = dst;
    unsigned pairs_left;

    for (pairs_left = v_size >> 1; pairs_left != 0; ) {
 	pairs_left--;
 	yuv2rgb_c_internal (py, py + y_stride, pu, pv, out, out + rgb_stride,
 			    h_size, pairs_left<<1);

 	/* two luma rows and two output rows share one chroma row */
 	out += 2 * rgb_stride;
 	py  += 2 * y_stride;
 	pu  += uv_stride;
 	pv  += uv_stride;
    }
 }

 void * table_rV[256];
 void * table_gU[256];
 int table_gV[256];
 void * table_bU[256];

 /**
 * Selects the global yuv2rgb converter.
 * Does nothing if a converter is already installed, so bpp and mode of any
 * later call are ignored. Otherwise the candidates are tried in this order:
 * MMX2 or MMX (x86 asm builds; if the CPU has MMX2 the plain MMX variant is
 * not tried even when the MMX2 init fails), mlib (HAVE_MLIB builds), and
 * finally the C implementation with freshly initialised global tables.
 */
 void yuv2rgb_init (unsigned bpp, int mode)
 {
    if(yuv2rgb != NULL) return;
 #ifdef CAN_COMPILE_X86_ASM
    if(gCpuCaps.hasMMX2)
    {
 	yuv2rgb = yuv2rgb_init_MMX2 (bpp, mode);
 	if (yuv2rgb == NULL)
 		mp_msg(MSGT_SWS,MSGL_WARN,"Cannot init MMX2 colorspace transform\n");
 	else
 		mp_msg(MSGT_SWS,MSGL_INFO,"Using MMX2 for colorspace transform\n");
    }
    else if(gCpuCaps.hasMMX)
    {
 	yuv2rgb = yuv2rgb_init_MMX (bpp, mode);
 	if (yuv2rgb == NULL)
 		mp_msg(MSGT_SWS,MSGL_WARN,"Cannot init MMX colorspace transform\n");
 	else
 		mp_msg(MSGT_SWS,MSGL_INFO,"Using MMX for colorspace transform\n");
    }
 #endif
 #ifdef HAVE_MLIB
    if (yuv2rgb == NULL /*&& (config.flags & VO_MLIB_ENABLE)*/) {
 	yuv2rgb = yuv2rgb_init_mlib (bpp, mode);
 	if (yuv2rgb != NULL)
 	    mp_msg(MSGT_SWS,MSGL_INFO,"Using mlib for colorspace transform\n");
    }
 #endif
    /* an accelerated converter was found above */
    if (yuv2rgb != NULL) return;

    mp_msg(MSGT_SWS,MSGL_INFO,"No accelerated colorspace conversion found\n");
    yuv2rgb_c_init (bpp, mode, table_rV, table_gU, table_gV, table_bU);
    yuv2rgb = (yuv2rgb_fun)yuv2rgb_c;
 }

 #define RGB(i)					\
 	U = pu[i];				\
 	V = pv[i];				\
 	r = table_rV[V];			\
 	g = table_gU[U] + table_gV[V];		\
 	b = table_bU[U];
 	r = c->table_rV[V];			\
 	g = c->table_gU[U] + c->table_gV[V];		\
 	b = c->table_bU[U];

 #define DST1(i)					\
 	Y = py_1[2*i];				\
@@ -343,19 +261,42 @@ void yuv2rgb_init (unsigned bpp, int mode)
 	Y = py_2[2*i+1];						\
 	dst_2[6*i+3] = b[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = r[Y];

 static void yuv2rgb_c_32 (uint8_t * py_1, uint8_t * py_2,
 			  uint8_t * pu, uint8_t * pv,
 			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
 {
    int U, V, Y;
    uint32_t * r, * g, * b;
    uint32_t * dst_1, * dst_2;

    h_size >>= 3;
    dst_1 = _dst_1;
    dst_2 = _dst_2;
 /*
  * PROLOG(func_name, dst_type) opens a static converter named func_name with
  * the SwsFunc parameter list and leaves two loops open: an outer loop over
  * pairs of rows and an inner loop over groups of 8 pixels. The statements
  * written between PROLOG and EPILOG form the body of the inner loop; EPILOG
  * closes both loops and the function.
  *
  * dst_type is the element type of the two output row pointers (dst_1, dst_2)
  * and of the lookup pointers r, g, b.
  *
  * - Source planes are put in YUV order by sws_orderYUV() and indexed from the
  *   start of the slice; output rows are additionally offset by srcSliceY.
  * - For IMGFMT_422P the chroma strides are doubled, so only every second
  *   chroma row is read.
  * - The inner loop runs dstW>>3 times, so pixels beyond the last multiple
  *   of 8 are not converted.
  * - NOTE(review): rows are always handled in pairs (y and y+1); with an odd
  *   srcSliceH the second row of the last pair lies outside the slice -
  *   confirm callers only pass even heights.
  */
 #define PROLOG(func_name, dst_type) \
 static void func_name(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY, \
             int srcSliceH, uint8_t* dst[], int dstStride[]){\
    uint8_t *src[3];\
    int srcStride[3];\
    int y;\
 \
    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);\
    if(c->srcFormat == IMGFMT_422P){\
 	srcStride[1] *= 2;\
 	srcStride[2] *= 2;\
    }\
    for(y=0; y<srcSliceH; y+=2){\
 	dst_type *dst_1= (dst_type*)(dst[0] + (y+srcSliceY  )*dstStride[0]);\
 	dst_type *dst_2= (dst_type*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);\
 	dst_type *r, *g, *b;\
 	uint8_t *py_1= src[0] + y*srcStride[0];\
 	uint8_t *py_2= py_1 + srcStride[0];\
 	uint8_t *pu= src[1] + (y>>1)*srcStride[1];\
 	uint8_t *pv= src[2] + (y>>1)*srcStride[2];\
 	unsigned int h_size= c->dstW>>3;\
 	while (h_size--) {\
 	    int U, V, Y;\

 /*
  * EPILOG(dst_delta) advances all pointers by one group of 8 pixels (8 luma
  * samples, 4 samples per chroma plane) and closes the scopes opened by
  * PROLOG. dst_delta is the number of dst_type elements that 8 output pixels
  * occupy (the converters in this file pass 8, 24, 4 or 1).
  */
 #define EPILOG(dst_delta)\
 	    pu += 4;\
 	    pv += 4;\
 	    py_1 += 8;\
 	    py_2 += 8;\
 	    dst_1 += dst_delta;\
 	    dst_2 += dst_delta;\
 	}\
    }\
 }

    while (h_size--) {
 PROLOG(yuv2rgb_c_32, uint32_t)
 	RGB(0);
 	DST1(0);
 	DST2(0);
@@ -371,30 +312,9 @@ static void yuv2rgb_c_32 (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2(3);
 	DST1(3);
 EPILOG(8)

 	pu += 4;
 	pv += 4;
 	py_1 += 8;
 	py_2 += 8;
 	dst_1 += 8;
 	dst_2 += 8;
    }
 }

 // This is very near from the yuv2rgb_c_32 code
 static void yuv2rgb_c_24_rgb (uint8_t * py_1, uint8_t * py_2,
 			      uint8_t * pu, uint8_t * pv,
 			      void * _dst_1, void * _dst_2, int h_size, int v_pos)
 {
    int U, V, Y;
    uint8_t * r, * g, * b;
    uint8_t * dst_1, * dst_2;

    h_size >>= 3;
    dst_1 = _dst_1;
    dst_2 = _dst_2;

    while (h_size--) {
 PROLOG(yuv2rgb_c_24_rgb, uint8_t)
 	RGB(0);
 	DST1RGB(0);
 	DST2RGB(0);
@@ -410,30 +330,10 @@ static void yuv2rgb_c_24_rgb (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2RGB(3);
 	DST1RGB(3);

 	pu += 4;
 	pv += 4;
 	py_1 += 8;
 	py_2 += 8;
 	dst_1 += 24;
 	dst_2 += 24;
    }
 }
 EPILOG(24)

 // only trivial mods from yuv2rgb_c_24_rgb
 static void yuv2rgb_c_24_bgr (uint8_t * py_1, uint8_t * py_2,
 			      uint8_t * pu, uint8_t * pv,
 			      void * _dst_1, void * _dst_2, int h_size, int v_pos)
 {
    int U, V, Y;
    uint8_t * r, * g, * b;
    uint8_t * dst_1, * dst_2;

    h_size >>= 3;
    dst_1 = _dst_1;
    dst_2 = _dst_2;

    while (h_size--) {
 PROLOG(yuv2rgb_c_24_bgr, uint8_t)
 	RGB(0);
 	DST1BGR(0);
 	DST2BGR(0);
@@ -449,31 +349,11 @@ static void yuv2rgb_c_24_bgr (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2BGR(3);
 	DST1BGR(3);

 	pu += 4;
 	pv += 4;
 	py_1 += 8;
 	py_2 += 8;
 	dst_1 += 24;
 	dst_2 += 24;
    }
 }
 EPILOG(24)

 // This is exactly the same code as yuv2rgb_c_32 except for the types of
 // r, g, b, dst_1, dst_2
 static void yuv2rgb_c_16 (uint8_t * py_1, uint8_t * py_2,
 			  uint8_t * pu, uint8_t * pv,
 			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
 {
    int U, V, Y;
    uint16_t * r, * g, * b;
    uint16_t * dst_1, * dst_2;

    h_size >>= 3;
    dst_1 = _dst_1;
    dst_2 = _dst_2;

    while (h_size--) {
 PROLOG(yuv2rgb_c_16, uint16_t)
 	RGB(0);
 	DST1(0);
 	DST2(0);
@@ -489,31 +369,11 @@ static void yuv2rgb_c_16 (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2(3);
 	DST1(3);

 	pu += 4;
 	pv += 4;
 	py_1 += 8;
 	py_2 += 8;
 	dst_1 += 8;
 	dst_2 += 8;
    }
 }
 EPILOG(8)

 // This is exactly the same code as yuv2rgb_c_32 except for the types of
 // r, g, b, dst_1, dst_2
 static void yuv2rgb_c_8  (uint8_t * py_1, uint8_t * py_2,
 			  uint8_t * pu, uint8_t * pv,
 			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
 {
    int U, V, Y;
    uint8_t * r, * g, * b;
    uint8_t * dst_1, * dst_2;

    h_size >>= 3;
    dst_1 = _dst_1;
    dst_2 = _dst_2;

    while (h_size--) {
 PROLOG(yuv2rgb_c_8, uint8_t)
 	RGB(0);
 	DST1(0);
 	DST2(0);
@@ -529,32 +389,12 @@ static void yuv2rgb_c_8  (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2(3);
 	DST1(3);

 	pu += 4;
 	pv += 4;
 	py_1 += 8;
 	py_2 += 8;
 	dst_1 += 8;
 	dst_2 += 8;
    }
 }
 EPILOG(8)

 // r, g, b, dst_1, dst_2
 static void yuv2rgb_c_8_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
 			  uint8_t * pu, uint8_t * pv,
 			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
 {
    int U, V, Y;
    uint8_t * r, * g, * b;
    uint8_t * dst_1, * dst_2;

    h_size >>= 3;
    dst_1 = _dst_1;
    dst_2 = _dst_2;

    while (h_size--) {
 	const uint8_t *d32= dither_8x8_32[v_pos&7];
 	const uint8_t *d64= dither_8x8_73[v_pos&7];
 PROLOG(yuv2rgb_c_8_ordered_dither, uint8_t)
 	const uint8_t *d32= dither_8x8_32[y&7];
 	const uint8_t *d64= dither_8x8_73[y&7];
 #define DST1bpp8(i,o)					\
 	Y = py_1[2*i];				\
 	dst_1[2*i] = r[Y+d32[0+o]] + g[Y+d32[0+o]] + b[Y+d64[0+o]];	\
@@ -583,32 +423,12 @@ static void yuv2rgb_c_8_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2bpp8(3,6);
 	DST1bpp8(3,6);

 	pu += 4;
 	pv += 4;
 	py_1 += 8;
 	py_2 += 8;
 	dst_1 += 8;
 	dst_2 += 8;
    }
 }
 EPILOG(8)


 // This is exactly the same code as yuv2rgb_c_32 except for the types of
 // r, g, b, dst_1, dst_2
 static void yuv2rgb_c_4  (uint8_t * py_1, uint8_t * py_2,
 			  uint8_t * pu, uint8_t * pv,
 			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
 {
    int U, V, Y;
    uint8_t * r, * g, * b;
    uint8_t * dst_1, * dst_2;

    h_size >>= 3;
    dst_1 = _dst_1;
    dst_2 = _dst_2;

    while (h_size--) {
 PROLOG(yuv2rgb_c_4, uint8_t)
        int acc;
 #define DST1_4(i)					\
 	Y = py_1[2*i];				\
@@ -639,31 +459,11 @@ static void yuv2rgb_c_4  (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2_4(3);
 	DST1_4(3);
 EPILOG(4)

 	pu += 4;
 	pv += 4;
 	py_1 += 8;
 	py_2 += 8;
 	dst_1 += 4;
 	dst_2 += 4;
    }
 }

 static void yuv2rgb_c_4_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
 			  uint8_t * pu, uint8_t * pv,
 			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
 {
    int U, V, Y;
    uint8_t * r, * g, * b;
    uint8_t * dst_1, * dst_2;

    h_size >>= 3;
    dst_1 = _dst_1;
    dst_2 = _dst_2;

    while (h_size--) {
 	const uint8_t *d64= dither_8x8_73[v_pos&7];
 	const uint8_t *d128=dither_8x8_220[v_pos&7];
 PROLOG(yuv2rgb_c_4_ordered_dither, uint8_t)
 	const uint8_t *d64= dither_8x8_73[y&7];
 	const uint8_t *d128=dither_8x8_220[y&7];
        int acc;

 #define DST1bpp4(i,o)					\
@@ -696,31 +496,11 @@ static void yuv2rgb_c_4_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2bpp4(3,6);
 	DST1bpp4(3,6);

 	pu += 4;
 	pv += 4;
 	py_1 += 8;
 	py_2 += 8;
 	dst_1 += 4;
 	dst_2 += 4;
    }
 }
 EPILOG(4)

 // This is exactly the same code as yuv2rgb_c_32 except for the types of
 // r, g, b, dst_1, dst_2
 static void yuv2rgb_c_4b  (uint8_t * py_1, uint8_t * py_2,
 			  uint8_t * pu, uint8_t * pv,
 			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
 {
    int U, V, Y;
    uint8_t * r, * g, * b;
    uint8_t * dst_1, * dst_2;

    h_size >>= 3;
    dst_1 = _dst_1;
    dst_2 = _dst_2;

    while (h_size--) {
 PROLOG(yuv2rgb_c_4b, uint8_t)
 	RGB(0);
 	DST1(0);
 	DST2(0);
@@ -736,31 +516,11 @@ static void yuv2rgb_c_4b  (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2(3);
 	DST1(3);
 EPILOG(8)

 	pu += 4;
 	pv += 4;
 	py_1 += 8;
 	py_2 += 8;
 	dst_1 += 8;
 	dst_2 += 8;
    }
 }

 static void yuv2rgb_c_4b_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
 			  uint8_t * pu, uint8_t * pv,
 			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
 {
    int U, V, Y;
    uint8_t * r, * g, * b;
    uint8_t * dst_1, * dst_2;

    h_size >>= 3;
    dst_1 = _dst_1;
    dst_2 = _dst_2;

    while (h_size--) {
 	const uint8_t *d64= dither_8x8_73[v_pos&7];
 	const uint8_t *d128=dither_8x8_220[v_pos&7];
 PROLOG(yuv2rgb_c_4b_ordered_dither, uint8_t)
 	const uint8_t *d64= dither_8x8_73[y&7];
 	const uint8_t *d128=dither_8x8_220[y&7];

 #define DST1bpp4b(i,o)					\
 	Y = py_1[2*i];				\
@@ -790,31 +550,11 @@ static void yuv2rgb_c_4b_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2bpp4b(3,6);
 	DST1bpp4b(3,6);
 EPILOG(8)

 	pu += 4;
 	pv += 4;
 	py_1 += 8;
 	py_2 += 8;
 	dst_1 += 8;
 	dst_2 += 8;
    }
 }

 static void yuv2rgb_c_1_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
 			  uint8_t * pu, uint8_t * pv,
 			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
 {
    int Y;
    uint8_t * g;
    uint8_t * dst_1, * dst_2;

    h_size >>= 3;
    dst_1 = _dst_1;
    dst_2 = _dst_2;
    g= table_gU[128] + table_gV[128];

    while (h_size--) {
 	const uint8_t *d128=dither_8x8_220[v_pos&7];
 PROLOG(yuv2rgb_c_1_ordered_dither, uint8_t)
 	g= c->table_gU[128] + c->table_gV[128];
 	const uint8_t *d128=dither_8x8_220[y&7];
 	char out_1=0, out_2=0;

 #define DST1bpp1(i,o)					\
@@ -843,17 +583,59 @@ static void yuv2rgb_c_1_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
 	
 	dst_1[0]= out_1;
 	dst_2[0]= out_2;
 EPILOG(1)

 	pu += 4;
 	pv += 4;
 	py_1 += 8;
 	py_2 += 8;
 	dst_1 ++;
 	dst_2 ++;
 /**
  * Choose the YUV->RGB conversion routine for the destination format of c.
  *
  * Candidates are tried in this order:
  *   1. MMX2 routines, then MMX routines (ARCH_X86 builds only, and only
  *      for the BGR32/BGR24/BGR16/BGR15 destination formats),
  *   2. whatever yuv2rgb_init_mlib() offers (HAVE_MLIB builds only),
  *   3. the plain C routines, after a warning has been logged.
  *
  * @param c  context whose dstFormat field selects the routine
  * @return   the selected routine; for a destination format that no branch
  *           handles, assert(0) fires and, if asserts are compiled out,
  *           NULL is returned
  */
 SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
 {
 #ifdef ARCH_X86
    /* A format not handled by the MMX2 set falls through to the MMX set. */
    if(gCpuCaps.hasMMX2){
 	if(c->dstFormat == IMGFMT_BGR32) return yuv420_rgb32_MMX2;
 	if(c->dstFormat == IMGFMT_BGR24) return yuv420_rgb24_MMX2;
 	if(c->dstFormat == IMGFMT_BGR16) return yuv420_rgb16_MMX2;
 	if(c->dstFormat == IMGFMT_BGR15) return yuv420_rgb15_MMX2;
    }
    if(gCpuCaps.hasMMX){
 	if(c->dstFormat == IMGFMT_BGR32) return yuv420_rgb32_MMX;
 	if(c->dstFormat == IMGFMT_BGR24) return yuv420_rgb24_MMX;
 	if(c->dstFormat == IMGFMT_BGR16) return yuv420_rgb16_MMX;
 	if(c->dstFormat == IMGFMT_BGR15) return yuv420_rgb15_MMX;
    }
 #endif
 #ifdef HAVE_MLIB
    {
 	SwsFunc mlibFunc= yuv2rgb_init_mlib(c);
 	if(mlibFunc != NULL) return mlibFunc;
    }
 #endif
    /* Nothing accelerated matched: tell the user, then use the C code. */
    mp_msg(MSGT_SWS,MSGL_WARN,"No accelerated colorspace conversion found\n");

    if(c->dstFormat == IMGFMT_RGB32 || c->dstFormat == IMGFMT_BGR32)
 	return yuv2rgb_c_32;
    if(c->dstFormat == IMGFMT_RGB24)
 	return yuv2rgb_c_24_rgb;
    if(c->dstFormat == IMGFMT_BGR24)
 	return yuv2rgb_c_24_bgr;
    /* 15 and 16 bpp share one routine in both component orders */
    if(c->dstFormat == IMGFMT_RGB16 || c->dstFormat == IMGFMT_BGR16 ||
       c->dstFormat == IMGFMT_RGB15 || c->dstFormat == IMGFMT_BGR15)
 	return yuv2rgb_c_16;
    if(c->dstFormat == IMGFMT_RGB8 || c->dstFormat == IMGFMT_BGR8)
 	return yuv2rgb_c_8_ordered_dither;
    if(c->dstFormat == IMGFMT_RGB4 || c->dstFormat == IMGFMT_BGR4)
 	return yuv2rgb_c_4_ordered_dither;
    if(c->dstFormat == IMGFMT_RG4B || c->dstFormat == IMGFMT_BG4B)
 	return yuv2rgb_c_4b_ordered_dither;
    if(c->dstFormat == IMGFMT_RGB1 || c->dstFormat == IMGFMT_BGR1)
 	return yuv2rgb_c_1_ordered_dither;

    /* unsupported destination format */
    assert(0);
    return NULL;
 }


 static int div_round (int dividend, int divisor)
 {
    if (dividend > 0)
@@ -862,8 +644,10 @@ static int div_round (int dividend, int divisor)
 	return -((-dividend + (divisor>>1)) / divisor);
 }

 void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_gU[256], int table_gV[256], void *table_bU[256])
 int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation)
 {  
    const int bpp= c->dstFormat&0xFF;
    const int isRgb= (c->dstFormat>>24) != 'R';
    int i;
    uint8_t table_Y[1024];
    uint32_t *table_32 = 0;
@@ -876,23 +660,37 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
    void *table_r = 0, *table_g = 0, *table_b = 0;
    void *table_start;

    int crv = Inverse_Table_6_9[matrix_coefficients][0];
    int cbu = Inverse_Table_6_9[matrix_coefficients][1];
    int cgu = -Inverse_Table_6_9[matrix_coefficients][2];
    int cgv = -Inverse_Table_6_9[matrix_coefficients][3];
    int64_t crv =  inv_table[0];
    int64_t cbu =  inv_table[1];
    int64_t cgu = -inv_table[2];
    int64_t cgv = -inv_table[3];
    int64_t cy  = 1<<16;
    int64_t oy  = 0;

 //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
    if(!fullRange){
 	cy= (cy*255) / 219;
 	oy= 16<<16;
    }
 	
    cy = (cy *contrast             )>>16;
    crv= (crv*contrast * saturation)>>32;
    cbu= (cbu*contrast * saturation)>>32;
    cgu= (cgu*contrast * saturation)>>32;
    cgv= (cgv*contrast * saturation)>>32;
 //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
    oy -= 256*brightness;

    for (i = 0; i < 1024; i++) {
 	int j;

 	j = (76309 * (i - 384 - 16) + 32768) >> 16;
 	j= (cy*(((i - 384)<<16) - oy) + (1<<31))>>32;
 	j = (j < 0) ? 0 : ((j > 255) ? 255 : j);
 	table_Y[i] = j;
    }

    switch (bpp) {
    case 32:
 	yuv2rgb_c_internal = yuv2rgb_c_32;

 	table_start= table_32 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint32_t));

 	entry_size = sizeof (uint32_t);
@@ -901,17 +699,14 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	table_g = table_32 + 197 + 2*682;

 	for (i = -197; i < 256+197; i++)
 	    ((uint32_t *)table_r)[i] = table_Y[i+384] << ((mode==MODE_RGB) ? 16 : 0);
 	    ((uint32_t *)table_r)[i] = table_Y[i+384] << (isRgb ? 16 : 0);
 	for (i = -132; i < 256+132; i++)
 	    ((uint32_t *)table_g)[i] = table_Y[i+384] << 8;
 	for (i = -232; i < 256+232; i++)
 	    ((uint32_t *)table_b)[i] = table_Y[i+384] << ((mode==MODE_RGB) ? 0 : 16);
 	    ((uint32_t *)table_b)[i] = table_Y[i+384] << (isRgb ? 0 : 16);
 	break;

    case 24:
 //	yuv2rgb_c_internal = (mode==MODE_RGB) ? yuv2rgb_c_24_rgb : yuv2rgb_c_24_bgr;
 	yuv2rgb_c_internal = (mode!=MODE_RGB) ? yuv2rgb_c_24_rgb : yuv2rgb_c_24_bgr;

 	table_start= table_8 = malloc ((256 + 2*232) * sizeof (uint8_t));

 	entry_size = sizeof (uint8_t);
@@ -923,8 +718,6 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g

    case 15:
    case 16:
 	yuv2rgb_c_internal = yuv2rgb_c_16;

 	table_start= table_16 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint16_t));

 	entry_size = sizeof (uint16_t);
@@ -935,7 +728,7 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	for (i = -197; i < 256+197; i++) {
 	    int j = table_Y[i+384] >> 3;

 	    if (mode == MODE_RGB)
 	    if (isRgb)
 		j <<= ((bpp==16) ? 11 : 10);

 	    ((uint16_t *)table_r)[i] = j;
@@ -948,7 +741,7 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	for (i = -232; i < 256+232; i++) {
 	    int j = table_Y[i+384] >> 3;

 	    if (mode == MODE_BGR)
 	    if (!isRgb)
 		j <<= ((bpp==16) ? 11 : 10);

 	    ((uint16_t *)table_b)[i] = j;
@@ -956,8 +749,6 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	break;

    case 8:
 	yuv2rgb_c_internal = yuv2rgb_c_8_ordered_dither; //yuv2rgb_c_8;

 	table_start= table_332 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t));

 	entry_size = sizeof (uint8_t);
@@ -968,7 +759,7 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	for (i = -197; i < 256+197; i++) {
 	    int j = (table_Y[i+384 - 16] + 18)/36;

 	    if (mode == MODE_RGB)
 	    if (isRgb)
 		j <<= 5;

 	    ((uint8_t *)table_r)[i] = j;
@@ -976,7 +767,7 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	for (i = -132; i < 256+132; i++) {
 	    int j = (table_Y[i+384 - 16] + 18)/36;

 	    if (mode == MODE_BGR)
 	    if (!isRgb)
 		j <<= 1;

 	    ((uint8_t *)table_g)[i] = j << 2;
@@ -984,7 +775,7 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	for (i = -232; i < 256+232; i++) {
 	    int j = (table_Y[i+384 - 37] + 43)/85;

 	    if (mode == MODE_BGR)
 	    if (!isRgb)
 		j <<= 6;

 	    ((uint8_t *)table_b)[i] = j;
@@ -992,11 +783,6 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	break;
    case 4:
    case 4|128:
        if(bpp==4)
 	    yuv2rgb_c_internal = yuv2rgb_c_4_ordered_dither; //yuv2rgb_c_4;
        else
 	    yuv2rgb_c_internal = yuv2rgb_c_4b_ordered_dither; //yuv2rgb_c_4;

 	table_start= table_121 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t));

 	entry_size = sizeof (uint8_t);
@@ -1007,7 +793,7 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	for (i = -197; i < 256+197; i++) {
 	    int j = table_Y[i+384 - 110] >> 7;

 	    if (mode == MODE_RGB)
 	    if (isRgb)
 		j <<= 3;

 	    ((uint8_t *)table_r)[i] = j;
@@ -1020,7 +806,7 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	for (i = -232; i < 256+232; i++) {
 	    int j =table_Y[i+384 - 110] >> 7;

 	    if (mode == MODE_BGR)
 	    if (!isRgb)
 		j <<= 3;

 	    ((uint8_t *)table_b)[i] = j;
@@ -1028,8 +814,6 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	break;

    case 1:
 	yuv2rgb_c_internal = yuv2rgb_c_1_ordered_dither;

 	table_start= table_1 = malloc (256*2 * sizeof (uint8_t));

 	entry_size = sizeof (uint8_t);
@@ -1046,15 +830,18 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
    default:
 	table_start= NULL;
 	mp_msg(MSGT_SWS,MSGL_ERR,"%ibpp not supported by yuv2rgb\n", bpp);
 	//exit (1);
 	//free mem?
 	return -1;
    }

    for (i = 0; i < 256; i++) {
 	table_rV[i] = table_r + entry_size * div_round (crv * (i-128), 76309);
 	table_gU[i] = table_g + entry_size * div_round (cgu * (i-128), 76309);
 	table_gV[i] = entry_size * div_round (cgv * (i-128), 76309);
 	table_bU[i] = table_b + entry_size * div_round (cbu * (i-128), 76309);
 	c->table_rV[i] = table_r + entry_size * div_round (crv * (i-128), 76309);
 	c->table_gU[i] = table_g + entry_size * div_round (cgu * (i-128), 76309);
 	c->table_gV[i] = entry_size * div_round (cgv * (i-128), 76309);
 	c->table_bU[i] = table_b + entry_size * div_round (cbu * (i-128), 76309);
    }

    return table_start; 
    if(c->yuvTable) free(c->yuvTable);
    c->yuvTable= table_start;
    return 0;
 }
--- a/postproc/yuv2rgb_mlib.c
+++ b/postproc/yuv2rgb_mlib.c
@@ -26,52 +26,72 @@
 #include <mlib_status.h>
 #include <mlib_sys.h>
 #include <mlib_video.h>
 #include <inttypes.h>
 #include <stdlib.h>
 #include <assert.h>

 static void mlib_YUV2ARGB420_32(uint8_t* image, uint8_t* py, 
 			 uint8_t* pu, uint8_t* pv, 
 			 unsigned h_size, unsigned v_size, 
 			 int rgb_stride, int y_stride, int uv_stride)
 {
  mlib_VideoColorYUV2ARGB420(image, py, pu, pv, h_size,
 			     v_size, rgb_stride, y_stride, uv_stride);
 }
 #include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff
 #include "swscale.h"

 static void mlib_YUV2ABGR420_32(uint8_t* image, uint8_t* py, 
 			 uint8_t* pu, uint8_t* pv, 
 			 unsigned h_size, unsigned v_size, 
 			 int rgb_stride, int y_stride, int uv_stride)
 {
  mlib_VideoColorYUV2ABGR420(image, py, pu, pv, h_size,
 			     v_size, rgb_stride, y_stride, uv_stride);
 /**
  * Slice converter producing 32-bit ARGB output through mediaLib's
  * mlib_VideoColorYUV2ARGB420().
  *
  * The source plane pointers and strides are rearranged by sws_orderYUV()
  * into local arrays; entries 0, 1 and 2 are then handed to mediaLib as the
  * y, u and v planes. For IMGFMT_422P input both chroma strides are doubled
  * (presumably so that every second chroma line is skipped and the data can
  * be consumed by the 4:2:0 routine).
  *
  * Only the current slice is converted: output starts at row srcSliceY of
  * dst[0] and srcSliceH rows are written, the same convention the MMX
  * converters use. Ignoring the slice arguments and always converting
  * c->dstH rows from the top of dst[0] would overwrite the wrong rows and
  * read past the supplied source rows whenever a partial slice is passed.
  *
  * @param c               context; srcFormat and dstW are read
  * @param srcParam        source plane pointers of the slice
  * @param srcStrideParam  source plane strides in bytes
  * @param srcSliceY       first row of the slice inside the picture
  * @param srcSliceH       number of rows in the slice
  * @param dst             destination planes; only dst[0] is written
  * @param dstStride       destination strides; only dstStride[0] is used
  */
 static void mlib_YUV2ARGB420_32(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY, 
             int srcSliceH, uint8_t* dst[], int dstStride[]){
    uint8_t *src[3];
    int srcStride[3];

    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
    if(c->srcFormat == IMGFMT_422P){
 	srcStride[1] *= 2;
 	srcStride[2] *= 2;
    }
    
    /* mediaLib takes a single stride for both chroma planes */
    assert(srcStride[1] == srcStride[2]);
 
    mlib_VideoColorYUV2ARGB420(dst[0] + srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW,
 			     srcSliceH, dstStride[0], srcStride[0], srcStride[1]);
 }

 static void mlib_YUV2RGB420_24(uint8_t* image, uint8_t* py, 
 			 uint8_t* pu, uint8_t* pv, 
 			 unsigned h_size, unsigned v_size, 
 			 int rgb_stride, int y_stride, int uv_stride)
 {
  mlib_VideoColorYUV2RGB420(image, py, pu, pv, h_size,
 			    v_size, rgb_stride, y_stride, uv_stride);
 /**
  * Slice converter producing 32-bit ABGR output through mediaLib's
  * mlib_VideoColorYUV2ABGR420().
  *
  * The source plane pointers and strides are rearranged by sws_orderYUV()
  * into local arrays; entries 0, 1 and 2 are then handed to mediaLib as the
  * y, u and v planes. For IMGFMT_422P input both chroma strides are doubled
  * (presumably so that every second chroma line is skipped and the data can
  * be consumed by the 4:2:0 routine).
  *
  * Only the current slice is converted: output starts at row srcSliceY of
  * dst[0] and srcSliceH rows are written, the same convention the MMX
  * converters use. Ignoring the slice arguments and always converting
  * c->dstH rows from the top of dst[0] would overwrite the wrong rows and
  * read past the supplied source rows whenever a partial slice is passed.
  *
  * @param c               context; srcFormat and dstW are read
  * @param srcParam        source plane pointers of the slice
  * @param srcStrideParam  source plane strides in bytes
  * @param srcSliceY       first row of the slice inside the picture
  * @param srcSliceH       number of rows in the slice
  * @param dst             destination planes; only dst[0] is written
  * @param dstStride       destination strides; only dstStride[0] is used
  */
 static void mlib_YUV2ABGR420_32(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY, 
             int srcSliceH, uint8_t* dst[], int dstStride[]){
    uint8_t *src[3];
    int srcStride[3];

    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
    if(c->srcFormat == IMGFMT_422P){
 	srcStride[1] *= 2;
 	srcStride[2] *= 2;
    }
    
    /* mediaLib takes a single stride for both chroma planes */
    assert(srcStride[1] == srcStride[2]);
 
    mlib_VideoColorYUV2ABGR420(dst[0] + srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW,
 			     srcSliceH, dstStride[0], srcStride[0], srcStride[1]);
 }

 static void mlib_YUV2RGB420_24(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY, 
             int srcSliceH, uint8_t* dst[], int dstStride[]){
    uint8_t *src[3];
    int srcStride[3];

 yuv2rgb_fun yuv2rgb_init_mlib(unsigned bpp, int mode) 
 {  
    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
    if(c->srcFormat == IMGFMT_422P){
 	srcStride[1] *= 2;
 	srcStride[2] *= 2;
    }
    
    assert(srcStride[1] == srcStride[2]);
 
    mlib_VideoColorYUV2RGB420(dst[0], src[0], src[1], src[2], c->dstW,
 			     c->dstH, dstStride[0], srcStride[0], srcStride[1]);
 }

 	if( bpp == 24 ) 
 	{
 		if( mode == MODE_RGB )
 			return mlib_YUV2RGB420_24;
  }

 	if( bpp == 32 ) 
 	{
 		if( mode == MODE_RGB )
 			return mlib_YUV2ARGB420_32;
 		else if( mode == MODE_BGR )
 			return mlib_YUV2ABGR420_32;
 /**
  * Select the mediaLib-based converter for the destination format of c.
  *
  * IMGFMT_RGB32 and IMGFMT_BGR32 differ in component order and therefore
  * need different converters; mapping both to the ARGB routine left
  * mlib_YUV2ABGR420_32 unused and swapped red and blue for one of them.
  * NOTE(review): the BGR32->ARGB / RGB32->ABGR pairing assumes the 32-bit
  * formats are defined as native-endian words on a big-endian host --
  * confirm against img_format.h.
  *
  * @param c  context whose dstFormat field selects the converter
  * @return   the converter, or NULL when no mediaLib routine is wired up
  *           for that destination format
  */
 SwsFunc yuv2rgb_init_mlib(SwsContext *c) 
 {
 	switch(c->dstFormat){
 	case IMGFMT_RGB24: return mlib_YUV2RGB420_24;
 	case IMGFMT_BGR32: return mlib_YUV2ARGB420_32;
 	case IMGFMT_RGB32: return mlib_YUV2ABGR420_32;
 	default: return NULL;
 	}
 }

--- a/postproc/yuv2rgb_template.c
+++ b/postproc/yuv2rgb_template.c
@@ -25,6 +25,7 @@
 *
 * 15,24 bpp and dithering from Michael Niedermayer (michaelni@gmx.at)
 * MMX/MMX2 Template stuff from Michael Niedermayer (needed for fast movntq support)
 * context / deglobalize stuff by Michael Niedermayer
 */

 #undef MOVNTQ
@@ -55,27 +56,25 @@
 		     /* convert the chroma part */\
 		     "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
 		     "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
 \
 		     "psubsw "MANGLE(mmx_80w)", %%mm0;" /* Cb -= 128 */ \
 		     "psubsw "MANGLE(mmx_80w)", %%mm1;" /* Cr -= 128 */ \
 \
 		     "psllw $3, %%mm0;" /* Promote precision */ \
 		     "psllw $3, %%mm1;" /* Promote precision */ \
 \
 		     "psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */ \
 		     "psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */ \
 \
 		     "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
 		     "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
 \
 		     "pmulhw "MANGLE(mmx_U_green)", %%mm2;" /* Mul Cb with green coeff -> Cb green */ \
 		     "pmulhw "MANGLE(mmx_V_green)", %%mm3;" /* Mul Cr with green coeff -> Cr green */ \
 		     "pmulhw "UG_COEFF"(%4), %%mm2;" /* Mul Cb with green coeff -> Cb green */ \
 		     "pmulhw "VG_COEFF"(%4), %%mm3;" /* Mul Cr with green coeff -> Cr green */ \
 \
 		     "pmulhw "MANGLE(mmx_U_blue)", %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\
 		     "pmulhw "MANGLE(mmx_V_red)", %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\
 		     "pmulhw "UB_COEFF"(%4), %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\
 		     "pmulhw "VR_COEFF"(%4), %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\
 \
 		     "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\
 \
 		     /* convert the luma part */\
 		     "psubusb "MANGLE(mmx_10w)", %%mm6;" /* Y -= 16 */\
 \
 		     "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
 		     "pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\
 \
@@ -84,8 +83,11 @@
 		     "psllw $3, %%mm6;" /* Promote precision */\
 		     "psllw $3, %%mm7;" /* Promote precision */\
 \
 		     "pmulhw "MANGLE(mmx_Y_coeff)", %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\
 		     "pmulhw "MANGLE(mmx_Y_coeff)", %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\
 		     "psubw "Y_OFFSET"(%4), %%mm6;" /* Y -= 16 */\
 		     "psubw "Y_OFFSET"(%4), %%mm7;" /* Y -= 16 */\
 \
 		     "pmulhw "Y_COEFF"(%4), %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\
 		     "pmulhw "Y_COEFF"(%4), %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\
 \
 		     /* Do the addition part of the conversion for even and odd pixels,
 			register usage:
@@ -121,44 +123,44 @@
 		     "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\


 static inline void RENAME(yuv420_rgb16) (uint8_t * image, uint8_t * py,
 			      uint8_t * pu, uint8_t * pv,
 			      unsigned h_size, unsigned v_size,
 			      int rgb_stride, int y_stride, int uv_stride)
 {
    int even = 1;
    int x, y;
 static inline void RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
    int srcStride[3];
    uint8_t *src[3];
    int y, h_size;

    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );

    for (y = v_size; --y >= 0; ) {
 	uint8_t *_image = image;
 	uint8_t *_py = py;
 	uint8_t *_pu = pu;
 	uint8_t *_pv = pv;
 	int internal_h_size= h_size;
 	int aligned_h_size= (h_size+7)&~7;
    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
    if(c->srcFormat == IMGFMT_422P){
 	srcStride[1] *= 2;
 	srcStride[2] *= 2;
    }

 	if(rgb_stride >= aligned_h_size*2) internal_h_size= aligned_h_size;
    h_size= (c->dstW+7)&~7;
    if(h_size*2 > dstStride[0]) h_size-=8;
    
    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
 //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0],
 //srcStride[0],srcStride[1],srcStride[2],dstStride[0]);
    for (y= 0; y<srcSliceH; y++ ) {
 	uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
 	uint8_t *_py = src[0] + y*srcStride[0];
 	uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
 	uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
 	int index= -h_size/2;

 	b5Dither= dither8[y&1];
 	g6Dither= dither4[y&1];
 	g5Dither= dither8[y&1];
 	r5Dither= dither8[(y+1)&1];

 	/* load data for start of next scan line */
 	__asm__ __volatile__ (
 		 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 		 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 		 "movq (%0), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */

 		 : : "r" (_py), "r" (_pu), "r" (_pv));

 	for (x = internal_h_size >> 3; --x >= 0; ) {
 	    /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
 	       pixels in each iteration */

 	    __asm__ __volatile__ (
 	/* load data for start of next scan line */
 		     "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 		     "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 		     "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 //		    ".balign 16			\n\t"
 		    "1:				\n\t"
 /* no speed difference on my p3@500 with prefetch,
 * if it is faster for anyone with -benchmark then tell me
 			PREFETCH" 64(%0) \n\t"
@@ -190,80 +192,71 @@ YUV2RGB
 		     "psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
 		     "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */

 		     "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 		     MOVNTQ " %%mm0, (%3);" /* store pixel 0-3 */
 		     "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 		     MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */

 		     /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
 		     "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
 		     "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

 		     "psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
 		     "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 		     "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

 		     "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
 		     "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

 		     MOVNTQ " %%mm5, 8 (%3);" /* store pixel 4-7 */
 		     : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));

 	    _py += 8;
 	    _pu += 4;
 	    _pv += 4;
 	    _image += 16;
 	}

 	if (!even) {
 	    pu += uv_stride;
 	    pv += uv_stride;
 	}

 	py += y_stride;
 	image += rgb_stride;

 	even = (!even);
 		     "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

 		     MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
 		     
 		     "addl $16, %1			\n\t"
 		     "addl $4, %0			\n\t"
 		     " js 1b				\n\t"
 		     
 		     : "+r" (index), "+r" (_image)
 		     : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
 		     );
    }

    __asm__ __volatile__ (EMMS);
 }

 static inline void RENAME(yuv420_rgb15) (uint8_t * image, uint8_t * py,
 			      uint8_t * pu, uint8_t * pv,
 			      unsigned h_size, unsigned v_size,
 			      int rgb_stride, int y_stride, int uv_stride)
 {
    int even = 1;
    int x, y;
 static inline void RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
    int srcStride[3];
    uint8_t *src[3];
    int y, h_size;

    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );

    for (y = v_size; --y >= 0; ) {
 	uint8_t *_image = image;
 	uint8_t *_py = py;
 	uint8_t *_pu = pu;
 	uint8_t *_pv = pv;
 	int internal_h_size= h_size;
 	int aligned_h_size= (h_size+7)&~7;
    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
    if(c->srcFormat == IMGFMT_422P){
 	srcStride[1] *= 2;
 	srcStride[2] *= 2;
    }

 	if(rgb_stride >= aligned_h_size*2) internal_h_size= aligned_h_size;
    h_size= (c->dstW+7)&~7;
    if(h_size*2 > dstStride[0]) h_size-=8;
    
    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
 //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0],
 //srcStride[0],srcStride[1],srcStride[2],dstStride[0]);
    for (y= 0; y<srcSliceH; y++ ) {
 	uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
 	uint8_t *_py = src[0] + y*srcStride[0];
 	uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
 	uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
 	int index= -h_size/2;

 	b5Dither= dither8[y&1];
 	g6Dither= dither4[y&1];
 	g5Dither= dither8[y&1];
 	r5Dither= dither8[(y+1)&1];

 	/* load data for start of next scan line */
 	__asm__ __volatile__ (
 		 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 		 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 		 "movq (%0), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */

 		 : : "r" (_py), "r" (_pu), "r" (_pv));

 	for (x = internal_h_size >> 3; --x >= 0; ) {
 	    /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
 	       pixels in each iteration */

 	    __asm__ __volatile__ (
 	/* load data for start of next scan line */
 		     "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 		     "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 		     "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 //		    ".balign 16			\n\t"
 		    "1:				\n\t"
 YUV2RGB

 #ifdef DITHER1XBPP
@@ -291,75 +284,65 @@ YUV2RGB
 		     "psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
 		     "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */

 		     "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 		     MOVNTQ " %%mm0, (%3);" /* store pixel 0-3 */
 		     "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 		     MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */

 		     /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
 		     "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */
 		     "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

 		     "psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
 		     "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 		     "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

 		     "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
 		     "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

 		     MOVNTQ " %%mm5, 8 (%3);" /* store pixel 4-7 */
 		     : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));

 	    _py += 8;
 	    _pu += 4;
 	    _pv += 4;
 	    _image += 16;
 	}

 	if (!even) {
 	    pu += uv_stride;
 	    pv += uv_stride;
 	}

 	py += y_stride;
 	image += rgb_stride;

 	even = (!even);
 		     "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

 		     MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
 		     
 		     "addl $16, %1			\n\t"
 		     "addl $4, %0			\n\t"
 		     " js 1b				\n\t"
 		     : "+r" (index), "+r" (_image)
 		     : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
 		     );
    }

    __asm__ __volatile__ (EMMS);
 }

 static inline void RENAME(yuv420_rgb24) (uint8_t * image, uint8_t * py,
 			      uint8_t * pu, uint8_t * pv,
 			      unsigned h_size, unsigned v_size,
 			      int rgb_stride, int y_stride, int uv_stride)
 {
    int even = 1;
    int x, y;
 static inline void RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
    int srcStride[3];
    uint8_t *src[3];
    int y, h_size;

    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );

    for (y = v_size; --y >= 0; ) {
 	uint8_t *_image = image;
 	uint8_t *_py = py;
 	uint8_t *_pu = pu;
 	uint8_t *_pv = pv;
 	int internal_h_size= h_size;
 	int aligned_h_size= (h_size+7)&~7;

 	if(rgb_stride >= aligned_h_size*3) internal_h_size= aligned_h_size;
    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
    if(c->srcFormat == IMGFMT_422P){
 	srcStride[1] *= 2;
 	srcStride[2] *= 2;
    }

 	/* load data for start of next scan line */
 	__asm__ __volatile__ (
 		 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 		 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 		 "movq (%0), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
    h_size= (c->dstW+7)&~7;
    if(h_size*3 > dstStride[0]) h_size-=8;
    
    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );

 		 : : "r" (_py), "r" (_pu), "r" (_pv));
    for (y= 0; y<srcSliceH; y++ ) {
 	uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
 	uint8_t *_py = src[0] + y*srcStride[0];
 	uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
 	uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
 	int index= -h_size/2;

 	for (x = internal_h_size >> 3; --x >= 0; ) {
 	    /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
 	       pixels in each iteration */

 	    __asm__ __volatile__ (
 	/* load data for start of next scan line */
 		     "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 		     "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 		     "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 //		    ".balign 16			\n\t"
 		    "1:				\n\t"
 YUV2RGB
 	/* mm0=B, %%mm2=G, %%mm1=R */
 #ifdef HAVE_MMX2
@@ -376,7 +359,7 @@ YUV2RGB
 			"psllq $8, %%mm3		\n\t" /* G2        G1       G0    */
 			"por %%mm5, %%mm6		\n\t"
 			"por %%mm3, %%mm6		\n\t"
 			MOVNTQ" %%mm6, (%3)		\n\t"
 			MOVNTQ" %%mm6, (%1)		\n\t"

 			"psrlq $8, %%mm2		\n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */
 			"pshufw $0xA5, %%mm0, %%mm5	\n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */
@@ -389,22 +372,22 @@ YUV2RGB

 			"por %%mm5, %%mm3		\n\t" /* B5    G4 B4     G3 B3    */
 			"por %%mm3, %%mm6		\n\t"
 			MOVNTQ" %%mm6, 8(%3)		\n\t"
 			MOVNTQ" %%mm6, 8(%1)		\n\t"

 			"pshufw $0xFF, %%mm0, %%mm5	\n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */
 			"pshufw $0xFA, %%mm2, %%mm3	\n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */
 			"pshufw $0xFA, %%mm1, %%mm6	\n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */
 			"movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 			"movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

 			"pand %%mm7, %%mm5		\n\t" /*       B7        B6       */
 			"pand %%mm4, %%mm3		\n\t" /*    G7        G6       G5 */
 			"pand "MANGLE(M24B)", %%mm6	\n\t" /* R7       R6        R5    */
 			"movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 			"movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 \
 			"por %%mm5, %%mm3		\n\t"
 			"por %%mm3, %%mm6		\n\t"
 			MOVNTQ" %%mm6, 16(%3)		\n\t"
 			"movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 			MOVNTQ" %%mm6, 16(%1)		\n\t"
 			"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 			"pxor %%mm4, %%mm4		\n\t"

 #else
@@ -442,83 +425,72 @@ YUV2RGB
 			"movq %%mm0, %%mm6		\n\t" /* 0RGBRGB0 1 */
 			"psllq $40, %%mm0		\n\t" /* GB000000 1 */
 			"por %%mm0, %%mm7		\n\t" /* GBRGBRGB 0 */
 			MOVNTQ" %%mm7, (%3)		\n\t"
 			MOVNTQ" %%mm7, (%1)		\n\t"

 			"movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 			"movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

 			"psrlq $24, %%mm6		\n\t" /* 0000RGBR 1 */
 			"movq %%mm5, %%mm1		\n\t" /* 0RGBRGB0 2 */
 			"psllq $24, %%mm5		\n\t" /* BRGB0000 2 */
 			"por %%mm5, %%mm6		\n\t" /* BRGBRGBR 1 */
 			MOVNTQ" %%mm6, 8(%3)		\n\t"
 			MOVNTQ" %%mm6, 8(%1)		\n\t"

 			"movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 			"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */

 			"psrlq $40, %%mm1		\n\t" /* 000000RG 2 */
 			"psllq $8, %%mm3		\n\t" /* RGBRGB00 3 */
 			"por %%mm3, %%mm1		\n\t" /* RGBRGBRG 2 */
 			MOVNTQ" %%mm1, 16(%3)		\n\t"
 			MOVNTQ" %%mm1, 16(%1)		\n\t"

 			"movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 			"movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 			"pxor %%mm4, %%mm4		\n\t"
 #endif

 		     : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));

 	    _py += 8;
 	    _pu += 4;
 	    _pv += 4;
 	    _image += 24;
 	}

 	if (!even) {
 	    pu += uv_stride;
 	    pv += uv_stride;
 	}

 	py += y_stride;
 	image += rgb_stride;

 	even = (!even);
 		     
 		     "addl $24, %1			\n\t"
 		     "addl $4, %0			\n\t"
 		     " js 1b				\n\t"
 		     
 		     : "+r" (index), "+r" (_image)
 		     : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
 		     );
    }

    __asm__ __volatile__ (EMMS);
 }

 static inline void RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
    int srcStride[3];
    uint8_t *src[3];
    int y, h_size;

 /*
  * yuv420_argb32: MMX conversion of planar Y/U/V input to packed 32-bit
  * output, one scan line at a time, 8 pixels (32 output bytes) per pass of
  * the inner asm.
  *
  * What the visible lines establish:
  *  - the chroma row advances once per two luma rows (the "even" toggle in
  *    one loop, the (y>>1) row index in the other);
  *  - when c->srcFormat is IMGFMT_422P both chroma strides are doubled
  *    before the row loop;
  *  - h_size is rounded up to a multiple of 8 from c->dstW and reduced by 8
  *    again if h_size*4 would exceed dstStride[0];
  *  - mm4 is zeroed before the loops and the function ends with EMMS.
  *
  * NOTE(review): this span appears to hold lines from two revisions of the
  * function interleaved.  There are two row loops, two sets of row pointer
  * declarations, paired MOVNTQ stores addressed through different operand
  * numbers, and a diff hunk header in the middle of the asm.  The names c,
  * src, srcStride, srcParam, srcStrideParam, dst, dstStride, srcSliceY and
  * srcSliceH are used but are not declared by the signature shown here.
  * Reconcile against the real file before relying on any of this.
  */
 static inline void RENAME(yuv420_argb32) (uint8_t * image, uint8_t * py,
 			       uint8_t * pu, uint8_t * pv,
 			       unsigned h_size, unsigned v_size,
 			       int rgb_stride, int y_stride, int uv_stride)
 {
    int even = 1;
    int x, y;
    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
    if(c->srcFormat == IMGFMT_422P){
 	srcStride[1] *= 2;
 	srcStride[2] *= 2;
    }

    h_size= (c->dstW+7)&~7;
    if(h_size*4 > dstStride[0]) h_size-=8;
    
    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );

    for (y = v_size; --y >= 0; ) {
 	uint8_t *_image = image;
 	uint8_t *_py = py;
 	uint8_t *_pu = pu;
 	uint8_t *_pv = pv;
 	int internal_h_size= h_size;
 	int aligned_h_size= (h_size+7)&~7;

 	if(rgb_stride >= aligned_h_size*4) internal_h_size= aligned_h_size;
    for (y= 0; y<srcSliceH; y++ ) {
 	uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
 	uint8_t *_py = src[0] + y*srcStride[0];
 	uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
 	uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
 	int index= -h_size/2;

 	/* load data for start of next scan line */
 	__asm__ __volatile__ 
 	    (
 	     "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ 
 	     "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ 
 	     "movq (%0), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ 
 	     : : "r" (_py), "r" (_pu), "r" (_pv)
 	     );

 	for (x = internal_h_size >> 3; --x >= 0; ) {
 	    /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
 	       pixels in each iteration */
 	    __asm__ __volatile__ (
 	/* load data for start of next scan line */
 		     "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 		     "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 		     "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 //		    ".balign 16			\n\t"
 		    "1:				\n\t"
 YUV2RGB
 		     /* convert RGB plane to RGB packed format,
 			mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
@@ -536,60 +508,40 @@ YUV2RGB
 		     "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */

 		     "punpcklwd %%mm7, %%mm6;" /* 00 R1 B1 G1 00 R0 B0 G0 */
 		     MOVNTQ " %%mm6, (%3);" /* Store ARGB1 ARGB0 */
 		     MOVNTQ " %%mm6, (%1);" /* Store ARGB1 ARGB0 */

 		     "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
 		     "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */

 		     "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 B3 G2 */
 		     MOVNTQ " %%mm6, 8 (%3);" /* Store ARGB3 ARGB2 */
 		     MOVNTQ " %%mm6, 8 (%1);" /* Store ARGB3 ARGB2 */

 		     "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
 		     "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */

 		     "punpcklwd %%mm5, %%mm4;" /* 00 R5 B5 G5 00 R4 B4 G4 */
 		     MOVNTQ " %%mm4, 16 (%3);" /* Store ARGB5 ARGB4 */
 		     MOVNTQ " %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */

 		     "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
 		     "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */

 		     "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 B6 G6 */
 		     MOVNTQ " %%mm4, 24 (%3);" /* Store ARGB7 ARGB6 */
 		     MOVNTQ " %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */

 		     "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 		     "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 		     "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 		     "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

 		     "pxor %%mm4, %%mm4;" /* zero mm4 */
 		     "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */

 		     : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));

 	    _py += 8;
 	    _pu += 4;
 	    _pv += 4;
 	    _image += 32;
 	}

 	if (!even) {
 	    pu += uv_stride;
 	    pv += uv_stride;
 	}

 	py += y_stride;
 	image += rgb_stride;

 	even = (!even);
 		     "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */

 		     "addl $32, %1			\n\t"
 		     "addl $4, %0			\n\t"
 		     " js 1b				\n\t"
 		     
 		     : "+r" (index), "+r" (_image)
 		     : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
 		     );
    }

    __asm__ __volatile__ (EMMS);
 }

 /*
  * Select the converter for the requested output depth.
  *
  * bpp:  output bits per pixel; 15, 16, 24 and 32 have a converter here.
  * mode: byte-order selector; only MODE_RGB is served by this function.
  *
  * Returns the matching RENAME()d converter, or NULL when this variant has
  * none for the bpp/mode pair, which the caller treats as "fall back to C".
  */
 yuv2rgb_fun RENAME(yuv2rgb_init) (unsigned bpp, int mode)
 {
    /* Every converter below is for MODE_RGB; other modes have no match. */
    if (mode != MODE_RGB)
	return NULL;

    switch (bpp) {
    case 15:
	return RENAME(yuv420_rgb15);
    case 16:
	return RENAME(yuv420_rgb16);
    case 24:
	return RENAME(yuv420_rgb24);
    case 32:
	return RENAME(yuv420_argb32);
    default:
	break;
    }

    return NULL; /* Fallback to C. */
 }