Originally committed as revision 27188 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscaletags/v0.5
| @@ -24,74 +24,73 @@ | |||
| /* | |||
| YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock | |||
| and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts | |||
| YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock | |||
| and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts | |||
| The following calculation is used for the conversion: | |||
| The following calculation is used for the conversion: | |||
| r = clipz((y-oy)*cy + crv*(v-128)) | |||
| g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) | |||
| b = clipz((y-oy)*cy + cbu*(u-128)) | |||
| r = clipz((y-oy)*cy + crv*(v-128)) | |||
| g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) | |||
| b = clipz((y-oy)*cy + cbu*(u-128)) | |||
| y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision. | |||
| y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision. | |||
| New factorization to eliminate the truncation error which was | |||
| occurring due to the byteop3p. | |||
| New factorization to eliminate the truncation error which was | |||
| occurring due to the byteop3p. | |||
| 1) use the byteop16m to subtract quad bytes; we use this in U8, | |||
| so the offsets then need to be renormalized to 8 bits. | |||
| 1) use the byteop16m to subtract quad bytes; we use this in U8, | |||
| so the offsets then need to be renormalized to 8 bits. | |||
| 2) scale operands up by a factor of 4 not 8 because Blackfin | |||
| multiplies include a shift. | |||
| 2) scale operands up by a factor of 4 not 8 because Blackfin | |||
| multiplies include a shift. | |||
| 3) compute into the accumulators cy*yx0, cy*yx1 | |||
| 3) compute into the accumulators cy*yx0, cy*yx1 | |||
| 4) compute each of the linear equations | |||
| r = clipz((y-oy)*cy + crv*(v-128)) | |||
| 4) compute each of the linear equations | |||
| r = clipz((y - oy) * cy + crv * (v - 128)) | |||
| g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) | |||
| g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128)) | |||
| b = clipz((y-oy)*cy + cbu*(u-128)) | |||
| b = clipz((y - oy) * cy + cbu * (u - 128)) | |||
| reuse of the accumulators requires that we actually multiply | |||
| twice, once with addition and the second time with a subtraction. | |||
| reuse of the accumulators requires that we actually multiply | |||
| twice, once with addition and the second time with a subtraction. | |||
| because of this we need to compute the equations in the order R B | |||
| then G saving the writes for B in the case of 24/32 bit color | |||
| formats. | |||
| because of this we need to compute the equations in the order R B | |||
| then G saving the writes for B in the case of 24/32 bit color | |||
| formats. | |||
| api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, | |||
| int dW, uint32_t *coeffs); | |||
| api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, | |||
| int dW, uint32_t *coeffs); | |||
| A B | |||
| --- --- | |||
| i2 = cb i3 = cr | |||
| i1 = coeff i0 = y | |||
| A B | |||
| --- --- | |||
| i2 = cb i3 = cr | |||
| i1 = coeff i0 = y | |||
| Where coeffs have the following layout in memory. | |||
| Where coeffs have the following layout in memory. | |||
| uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv; | |||
| uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv; | |||
| coeffs is a pointer to oy. | |||
| coeffs is a pointer to oy. | |||
| the {rgb} masks are only utilized by the 565 packing algorithm. Note the data | |||
| replication is used to simplify the internal algorithms for the dual mac architecture | |||
| of BlackFin. | |||
| the {rgb} masks are only utilized by the 565 packing algorithm. Note the data | |||
| replication is used to simplify the internal algorithms for the dual mac architecture | |||
| of BlackFin. | |||
| All routines are exported with _ff_bfin_ as a symbol prefix | |||
| All routines are exported with _ff_bfin_ as a symbol prefix | |||
| rough performance gain compared against -O3: | |||
| rough performance gain compared against -O3: | |||
| 2779809/1484290 187.28% | |||
| which translates to ~33c/pel to ~57c/pel for the reference vs 17.5 | |||
| c/pel for the optimized implementations. Not sure why there is such a | |||
| huge variation on the reference codes on Blackfin I guess it must have | |||
| to do with the memory system. | |||
| 2779809/1484290 187.28% | |||
| which translates to ~33c/pel to ~57c/pel for the reference vs 17.5 | |||
| c/pel for the optimized implementations. Not sure why there is such a | |||
| huge variation on the reference codes on Blackfin I guess it must have | |||
| to do with the memory system. | |||
| */ | |||
| #define mL3 .text | |||
| @@ -21,63 +21,63 @@ | |||
| */ | |||
| /* | |||
| convert I420 YV12 to RGB in various formats, | |||
| it rejects images that are not in 420 formats | |||
| it rejects images that don't have widths of multiples of 16 | |||
| it rejects images that don't have heights of multiples of 2 | |||
| reject defers to C simulation codes. | |||
| convert I420 YV12 to RGB in various formats, | |||
| it rejects images that are not in 420 formats | |||
| it rejects images that don't have widths of multiples of 16 | |||
| it rejects images that don't have heights of multiples of 2 | |||
| reject defers to C simulation codes. | |||
| lots of optimizations to be done here | |||
| lots of optimizations to be done here | |||
| 1. need to fix saturation code, I just couldn't get it to fly with packs and adds. | |||
| so we currently use max min to clip | |||
| 1. need to fix saturation code, I just couldn't get it to fly with packs and adds. | |||
| so we currently use max min to clip | |||
| 2. the inefficient use of chroma loading needs a bit of brushing up | |||
| 2. the inefficient use of chroma loading needs a bit of brushing up | |||
| 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls | |||
| 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls | |||
| MODIFIED to calculate coeffs from currently selected color space. | |||
| MODIFIED core to be a macro which you spec the output format. | |||
| ADDED UYVY conversion which is never called due to some thing in SWSCALE. | |||
| CORRECTED algorithm selection to be strict on input formats. | |||
| ADDED runtime detection of altivec. | |||
| MODIFIED to calculate coeffs from currently selected color space. | |||
| MODIFIED core to be a macro which you spec the output format. | |||
| ADDED UYVY conversion which is never called due to some thing in SWSCALE. | |||
| CORRECTED algorithm selection to be strict on input formats. | |||
| ADDED runtime detection of altivec. | |||
| ADDED altivec_yuv2packedX vertical scl + RGB converter | |||
| ADDED altivec_yuv2packedX vertical scl + RGB converter | |||
| March 27,2004 | |||
| PERFORMANCE ANALYSIS | |||
| March 27,2004 | |||
| PERFORMANCE ANALYSIS | |||
| The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test | |||
| The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence | |||
| The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test | |||
| The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence | |||
| 720*480*30 ~10MPS | |||
| 720*480*30 ~10MPS | |||
| so we have roughly 10 clocks per pixel; this is too high, something has to be wrong. | |||
| so we have roughly 10 clocks per pixel; this is too high, something has to be wrong. | |||
| OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min. | |||
| OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min. | |||
| OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much | |||
| guaranteed to have the input video frame it was just decompressed so | |||
| it probably resides in L1 caches. However we are creating the | |||
| output video stream this needs to use the DSTST instruction to | |||
| optimize for the cache. We couple this with the fact that we are | |||
| not going to be visiting the input buffer again so we mark it Least | |||
| Recently Used. This shaves 25% of the processor cycles off. | |||
| OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much | |||
| guaranteed to have the input video frame it was just decompressed so | |||
| it probably resides in L1 caches. However we are creating the | |||
| output video stream this needs to use the DSTST instruction to | |||
| optimize for the cache. We couple this with the fact that we are | |||
| not going to be visiting the input buffer again so we mark it Least | |||
| Recently Used. This shaves 25% of the processor cycles off. | |||
| Now MEMCPY is the largest mips consumer in the system, probably due | |||
| to the inefficient X11 stuff. | |||
| Now MEMCPY is the largest mips consumer in the system, probably due | |||
| to the inefficient X11 stuff. | |||
| GL libraries seem to be very slow on this machine 1.33Ghz PB running | |||
| Jaguar, this is not the case for my 1Ghz PB. I thought it might be | |||
| a versioning issue, however I have libGL.1.2.dylib for both | |||
| machines. ((We need to figure this out now)) | |||
| GL libraries seem to be very slow on this machine 1.33Ghz PB running | |||
| Jaguar, this is not the case for my 1Ghz PB. I thought it might be | |||
| a versioning issue, however I have libGL.1.2.dylib for both | |||
| machines. ((We need to figure this out now)) | |||
| GL2 libraries work now with patch for RGB32 | |||
| GL2 libraries work now with patch for RGB32 | |||
| NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor | |||
| NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor | |||
| Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. | |||
| Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. | |||
| */ | |||
| #include <stdio.h> | |||