Based on a somewhat similar idea in FFmpeg's swscale copy. (tags/n0.9)
| @@ -193,6 +193,18 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={ | |||
| { 77, 23, 60, 15, 72, 21, 56, 14, }, | |||
| }; | |||
| #endif | |||
/*
 * 8x8 dither matrix containing every even value from 0 to 126 exactly once.
 * When dithering is enabled, swScale() selects one 8-entry row per output
 * line (row dstY & 7 for luma, row chrDstY & 7 for chroma) and stores the
 * row pointers in c->lumDither8 and c->chrDither8.
 */
| DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = { | |||
| { 36, 68, 60, 92, 34, 66, 58, 90,}, | |||
| { 100, 4,124, 28, 98, 2,122, 26,}, | |||
| { 52, 84, 44, 76, 50, 82, 42, 74,}, | |||
| { 116, 20,108, 12,114, 18,106, 10,}, | |||
| { 32, 64, 56, 88, 38, 70, 62, 94,}, | |||
| { 96, 0,120, 24,102, 6,126, 30,}, | |||
| { 48, 80, 40, 72, 54, 86, 46, 78,}, | |||
| { 112, 16,104, 8,118, 22,110, 14,}, | |||
| }; | |||
/*
 * Flat row of eight bytes, all 64. swScale() points both c->lumDither8 and
 * c->chrDither8 at this row when should_dither is false, so the output
 * functions add the same constant to every column instead of a pattern.
 */
| DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] = | |||
| { 64, 64, 64, 64, 64, 64, 64, 64 }; | |||
| DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={ | |||
| { | |||
| @@ -387,16 +399,16 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter, | |||
| const int16_t *chrFilter, const int16_t **chrUSrc, | |||
| const int16_t **chrVSrc, | |||
| int chrFilterSize, const int16_t **alpSrc, | |||
| uint8_t *dest[4], int dstW, int chrDstW, | |||
| const uint8_t *lumDither, const uint8_t *chrDither) | |||
| uint8_t *dest[4], int dstW, int chrDstW) | |||
| { | |||
| uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], | |||
| *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; | |||
| int i; | |||
| const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; | |||
| //FIXME Optimize (just quickly written not optimized..) | |||
| for (i=0; i<dstW; i++) { | |||
| int val = lumDither[i&7] << 12; | |||
| int val = lumDither[i & 7] << 12; | |||
| int j; | |||
| for (j=0; j<lumFilterSize; j++) | |||
| val += lumSrc[j][i] * lumFilter[j]; | |||
| @@ -406,8 +418,8 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter, | |||
| if (uDest) | |||
| for (i=0; i<chrDstW; i++) { | |||
| int u = chrDither[i&7] << 12; | |||
| int v = chrDither[(i+3)&7] << 12; | |||
| int u = chrDither[i & 7] << 12; | |||
| int v = chrDither[(i + 3) & 7] << 12; | |||
| int j; | |||
| for (j=0; j<chrFilterSize; j++) { | |||
| u += chrUSrc[j][i] * chrFilter[j]; | |||
| @@ -420,7 +432,7 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter, | |||
| if (CONFIG_SWSCALE_ALPHA && aDest) | |||
| for (i=0; i<dstW; i++) { | |||
| int val = lumDither[i&7] << 12; | |||
| int val = lumDither[i & 7] << 12; | |||
| int j; | |||
| for (j=0; j<lumFilterSize; j++) | |||
| val += alpSrc[j][i] * lumFilter[j]; | |||
| @@ -432,29 +444,29 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter, | |||
/*
 * C implementation of the planar output path that swScale() labels
 * "unscaled YV12": writes one line of Y and, when the corresponding
 * destination pointers are set, one line of U/V and of alpha.
 *
 * Every 16-bit input sample has a dither value added, is shifted right by 7
 * and is clipped to 8 bits with av_clip_uint8(). The dither values are read
 * from c->lumDither8 (Y and alpha) and c->chrDither8 (U and V), indexed by
 * column & 7; V reads the chroma row three columns ahead, (i + 3) & 7.
 *
 * NOTE(review): this span is diff context with no +/- markers, so the old
 * and new forms of several lines appear back to back (the end of the
 * parameter list and the "int val" / "int u" / "int v" declarations).
 * Presumably only the second form of each pair belongs to the new code;
 * confirm against the applied file.
 */
| static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc, | |||
| const int16_t *chrUSrc, const int16_t *chrVSrc, | |||
| const int16_t *alpSrc, | |||
| uint8_t *dest[4], int dstW, int chrDstW, | |||
| const uint8_t *lumDither, const uint8_t *chrDither) | |||
| uint8_t *dest[4], int dstW, int chrDstW) | |||
| { | |||
| uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], | |||
| *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; | |||
| int i; | |||
| const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; | |||
| for (i=0; i<dstW; i++) { | |||
| int val= (lumSrc[i]+lumDither[i&7])>>7; | |||
| int val = (lumSrc[i]+ lumDither[i & 7]) >> 7; | |||
| yDest[i]= av_clip_uint8(val); | |||
| } | |||
| if (uDest) | |||
| for (i=0; i<chrDstW; i++) { | |||
| int u=(chrUSrc[i]+chrDither[i&7])>>7; | |||
| int v=(chrVSrc[i]+chrDither[(i+3)&7])>>7; | |||
| int u = (chrUSrc[i] + chrDither[i & 7]) >> 7; | |||
| int v = (chrVSrc[i] + chrDither[(i + 3) & 7]) >> 7; | |||
| uDest[i]= av_clip_uint8(u); | |||
| vDest[i]= av_clip_uint8(v); | |||
| } | |||
| if (CONFIG_SWSCALE_ALPHA && aDest) | |||
| for (i=0; i<dstW; i++) { | |||
| int val= (alpSrc[i]+lumDither[i&7])>>7; | |||
| int val = (alpSrc[i] + lumDither[i & 7]) >> 7; | |||
| aDest[i]= av_clip_uint8(val); | |||
| } | |||
| } | |||
| @@ -464,16 +476,16 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter, | |||
| const int16_t *chrFilter, const int16_t **chrUSrc, | |||
| const int16_t **chrVSrc, int chrFilterSize, | |||
| const int16_t **alpSrc, uint8_t *dest[4], | |||
| int dstW, int chrDstW, | |||
| const uint8_t *lumDither, const uint8_t *chrDither) | |||
| int dstW, int chrDstW) | |||
| { | |||
| uint8_t *yDest = dest[0], *uDest = dest[1]; | |||
| enum PixelFormat dstFormat = c->dstFormat; | |||
| const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; | |||
| //FIXME Optimize (just quickly written not optimized..) | |||
| int i; | |||
| for (i=0; i<dstW; i++) { | |||
| int val = lumDither[i&7]<<12; | |||
| int val = lumDither[i & 7] << 12; | |||
| int j; | |||
| for (j=0; j<lumFilterSize; j++) | |||
| val += lumSrc[j][i] * lumFilter[j]; | |||
| @@ -486,8 +498,8 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter, | |||
| if (dstFormat == PIX_FMT_NV12) | |||
| for (i=0; i<chrDstW; i++) { | |||
| int u = chrDither[i&7]<<12; | |||
| int v = chrDither[(i+3)&7]<<12; | |||
| int u = chrDither[i & 7] << 12; | |||
| int v = chrDither[(i + 3) & 7] << 12; | |||
| int j; | |||
| for (j=0; j<chrFilterSize; j++) { | |||
| u += chrUSrc[j][i] * chrFilter[j]; | |||
| @@ -499,8 +511,8 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter, | |||
| } | |||
| else | |||
| for (i=0; i<chrDstW; i++) { | |||
| int u = chrDither[i&7]<<12; | |||
| int v = chrDither[(i+3)&7]<<12; | |||
| int u = chrDither[i & 7] << 12; | |||
| int v = chrDither[(i + 3) & 7] << 12; | |||
| int j; | |||
| for (j=0; j<chrFilterSize; j++) { | |||
| u += chrUSrc[j][i] * chrFilter[j]; | |||
| @@ -2523,6 +2535,7 @@ static int swScale(SwsContext *c, const uint8_t* src[], | |||
| const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); | |||
| int lastDstY; | |||
| uint32_t *pal=c->pal_yuv; | |||
| int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat); | |||
| yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1; | |||
| yuv2planarX_fn yuv2yuvX = c->yuv2yuvX; | |||
| @@ -2578,6 +2591,9 @@ static int swScale(SwsContext *c, const uint8_t* src[], | |||
| lastInChrBuf= -1; | |||
| } | |||
| if (!should_dither) { | |||
| c->chrDither8 = c->lumDither8 = ff_sws_pb_64; | |||
| } | |||
| lastDstY= dstY; | |||
| for (;dstY < dstH; dstY++) { | |||
| @@ -2588,8 +2604,6 @@ static int swScale(SwsContext *c, const uint8_t* src[], | |||
| dst[2] + dstStride[2] * chrDstY, | |||
| (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL, | |||
| }; | |||
| const uint8_t *lumDither= should_dither ? dithers[7][dstY &7] : flat64; | |||
| const uint8_t *chrDither= should_dither ? dithers[7][chrDstY&7] : flat64; | |||
| const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input | |||
| const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)]; | |||
| @@ -2669,6 +2683,10 @@ static int swScale(SwsContext *c, const uint8_t* src[], | |||
| #if HAVE_MMX | |||
| updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf); | |||
| #endif | |||
| if (should_dither) { | |||
| c->chrDither8 = dither_8x8_128[chrDstY & 7]; | |||
| c->lumDither8 = dither_8x8_128[dstY & 7]; | |||
| } | |||
| if (dstY >= dstH-2) { | |||
| // hmm looks like we can't use MMX here without overwriting this array's tail | |||
| find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX, | |||
| @@ -2689,13 +2707,13 @@ static int swScale(SwsContext *c, const uint8_t* src[], | |||
| if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12 | |||
| const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL; | |||
| yuv2yuv1(c, lumSrcPtr[0], chrUSrcPtr[0], chrVSrcPtr[0], alpBuf, | |||
| dest, dstW, chrDstW, lumDither, chrDither); | |||
| dest, dstW, chrDstW); | |||
| } else { //General YV12 | |||
| yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize, | |||
| lumSrcPtr, vLumFilterSize, | |||
| vChrFilter + chrDstY * vChrFilterSize, | |||
| chrUSrcPtr, chrVSrcPtr, vChrFilterSize, | |||
| alpSrcPtr, dest, dstW, chrDstW, lumDither, chrDither); | |||
| alpSrcPtr, dest, dstW, chrDstW); | |||
| } | |||
| } else { | |||
| assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | |||
| @@ -75,8 +75,7 @@ typedef int (*SwsFunc)(struct SwsContext *context, const uint8_t* src[], | |||
| typedef void (*yuv2planar1_fn) (struct SwsContext *c, | |||
| const int16_t *lumSrc, const int16_t *chrUSrc, | |||
| const int16_t *chrVSrc, const int16_t *alpSrc, | |||
| uint8_t *dest[4], int dstW, int chrDstW, | |||
| const uint8_t *lumDither, const uint8_t *chrDither); | |||
| uint8_t *dest[4], int dstW, int chrDstW); | |||
| /** | |||
| * Write one line of horizontally scaled Y/U/V/A to planar output | |||
| * with multi-point vertical scaling between input pixels. | |||
| @@ -99,7 +98,7 @@ typedef void (*yuv2planarX_fn) (struct SwsContext *c, const int16_t *lumFilter, | |||
| const int16_t *chrFilter, const int16_t **chrUSrc, | |||
| const int16_t **chrVSrc, int chrFilterSize, | |||
| const int16_t **alpSrc, uint8_t *dest[4], | |||
| int dstW, int chrDstW, const uint8_t *lumDither, const uint8_t *chrDither); | |||
| int dstW, int chrDstW); | |||
| /** | |||
| * Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB | |||
| * output without any additional vertical scaling (or point-scaling). Note | |||
| @@ -323,7 +322,7 @@ typedef struct SwsContext { | |||
| #define UV_OFF "11*8+4*4*256*3+48" | |||
| #define UV_OFFx2 "11*8+4*4*256*3+56" | |||
| #define DITHER16 "11*8+4*4*256*3+64" | |||
| #define DITHER32 "11*8+4*4*256*3+64+16" | |||
| #define DITHER32 "11*8+4*4*256*3+80" | |||
| DECLARE_ALIGNED(8, uint64_t, redDither); | |||
| DECLARE_ALIGNED(8, uint64_t, greenDither); | |||
| @@ -351,6 +350,8 @@ typedef struct SwsContext { | |||
| uint16_t dither16[8]; | |||
| uint32_t dither32[8]; | |||
| const uint8_t *chrDither8, *lumDither8; | |||
| #if HAVE_ALTIVEC | |||
| vector signed short CY; | |||
| vector signed short CRV; | |||
| @@ -70,26 +70,62 @@ | |||
| : "%"REG_d, "%"REG_S\ | |||
| ); | |||
| #if !COMPILE_TEMPLATE_MMX2 | |||
/*
 * Widen the eight 8-bit dither values at srcDither to eight 16-bit values,
 * shift each right by 4, and store the 16 resulting bytes at the DITHER16
 * offset from &c->redDither (presumably the c->dither16 field; confirm
 * against the SwsContext layout).
 *
 * When rot is non-zero the 64-bit input is first rotated right by 24 bits
 * (psrlq $24 and psllq $40 combined with por), so output element i comes
 * from srcDither[(i + 3) & 7]. The yuv2yuvX caller passes rot = 1 only for
 * the V plane.
 *
 * Uses mm0 (zero), mm3 and mm4. NOTE(review): the asm statements declare no
 * clobbers, neither for the MMX registers nor for "memory", although they
 * store through operand %1; confirm this is intentional.
 */
| static av_always_inline void | |||
| dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot) | |||
| { | |||
| if (rot) { | |||
| __asm__ volatile("pxor %%mm0, %%mm0\n\t" | |||
| "movq (%0), %%mm3\n\t" | |||
| "movq %%mm3, %%mm4\n\t" | |||
| "psrlq $24, %%mm3\n\t" | |||
| "psllq $40, %%mm4\n\t" | |||
| "por %%mm4, %%mm3\n\t" | |||
| "movq %%mm3, %%mm4\n\t" | |||
| "punpcklbw %%mm0, %%mm3\n\t" | |||
| "punpckhbw %%mm0, %%mm4\n\t" | |||
| "psraw $4, %%mm3\n\t" | |||
| "psraw $4, %%mm4\n\t" | |||
| "movq %%mm3, "DITHER16"+0(%1)\n\t" | |||
| "movq %%mm4, "DITHER16"+8(%1)\n\t" | |||
| :: "r"(srcDither), "r"(&c->redDither) | |||
| ); | |||
| } else { | |||
| __asm__ volatile("pxor %%mm0, %%mm0\n\t" | |||
| "movq (%0), %%mm3\n\t" | |||
| "movq %%mm3, %%mm4\n\t" | |||
| "punpcklbw %%mm0, %%mm3\n\t" | |||
| "punpckhbw %%mm0, %%mm4\n\t" | |||
| "psraw $4, %%mm3\n\t" | |||
| "psraw $4, %%mm4\n\t" | |||
| "movq %%mm3, "DITHER16"+0(%1)\n\t" | |||
| "movq %%mm4, "DITHER16"+8(%1)\n\t" | |||
| :: "r"(srcDither), "r"(&c->redDither) | |||
| ); | |||
| } | |||
| } | |||
| #endif | |||
| static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, | |||
| const int16_t **lumSrc, int lumFilterSize, | |||
| const int16_t *chrFilter, const int16_t **chrUSrc, | |||
| const int16_t **chrVSrc, | |||
| int chrFilterSize, const int16_t **alpSrc, | |||
| uint8_t *dest[4], int dstW, int chrDstW, | |||
| const uint8_t *lumDither, const uint8_t *chrDither) | |||
| uint8_t *dest[4], int dstW, int chrDstW) | |||
| { | |||
| int i; | |||
| uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], | |||
| *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; | |||
| const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; | |||
| if (uDest) { | |||
| x86_reg uv_off = c->uv_offx2 >> 1; | |||
| for(i=0; i<8; i++) c->dither16[i] = chrDither[i]>>4; | |||
| dither_8to16(c, chrDither, 0); | |||
| YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) | |||
| for(i=0; i<8; i++) c->dither16[i] = chrDither[(i+3)&7]>>4; | |||
| dither_8to16(c, chrDither, 1); | |||
| YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off) | |||
| } | |||
| for(i=0; i<8; i++) c->dither16[i] = lumDither[i]>>4; | |||
| dither_8to16(c, lumDither, 0); | |||
| if (CONFIG_SWSCALE_ALPHA && aDest) { | |||
| YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0) | |||
| } | |||
| @@ -104,10 +140,6 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, | |||
| "movq "DITHER32"+8(%0), %%mm5 \n\t"\ | |||
| "movq "DITHER32"+16(%0), %%mm6 \n\t"\ | |||
| "movq "DITHER32"+24(%0), %%mm7 \n\t"\ | |||
| "pxor %%mm4, %%mm4 \n\t"\ | |||
| "pxor %%mm5, %%mm5 \n\t"\ | |||
| "pxor %%mm6, %%mm6 \n\t"\ | |||
| "pxor %%mm7, %%mm7 \n\t"\ | |||
| "mov (%%"REG_d"), %%"REG_S" \n\t"\ | |||
| ".p2align 4 \n\t"\ | |||
| "1: \n\t"\ | |||
| @@ -157,26 +189,87 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, | |||
| : "%"REG_a, "%"REG_d, "%"REG_S\ | |||
| ); | |||
| #if !COMPILE_TEMPLATE_MMX2 | |||
| /* | |||
| * Fill c->dither32[] with the eight 8-bit dither values from srcDither, | |||
| * each shifted left by 12. | |||
| * | |||
| * With rot set, entry i is taken from srcDither[(i + 3) & 7]; the | |||
| * yuv2yuvX_ar caller uses that rotated pattern for the V plane. Otherwise | |||
| * entry i is taken from srcDither[i]. | |||
| * | |||
| * An MMX version used to follow an unconditional return here and was | |||
| * therefore never executed. It widened the values to 32-bit lanes and then | |||
| * shifted them with psllw, which works on 16-bit lanes, so it has been | |||
| * removed rather than re-enabled. | |||
| */ | |||
| static av_always_inline void | |||
| dither_8to32(SwsContext *c, const uint8_t *srcDither, int rot) | |||
| { | |||
| /* Column offset into the dither row: 3 for the rotated pattern, else 0. */ | |||
| const int first = rot ? 3 : 0; | |||
| int i; | |||
| for (i = 0; i < 8; i++) | |||
| c->dither32[i] = srcDither[(i + first) & 7] << 12; | |||
| } | |||
| #endif | |||
| static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, | |||
| const int16_t **lumSrc, int lumFilterSize, | |||
| const int16_t *chrFilter, const int16_t **chrUSrc, | |||
| const int16_t **chrVSrc, | |||
| int chrFilterSize, const int16_t **alpSrc, | |||
| uint8_t *dest[4], int dstW, int chrDstW, | |||
| const uint8_t *lumDither, const uint8_t *chrDither) | |||
| uint8_t *dest[4], int dstW, int chrDstW) | |||
| { | |||
| int i; | |||
| uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], | |||
| *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; | |||
| const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; | |||
| if (uDest) { | |||
| x86_reg uv_off = c->uv_offx2 >> 1; | |||
| for(i=0; i<8; i++) c->dither32[i] = chrDither[i]<<12; | |||
| dither_8to32(c, chrDither, 0); | |||
| YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) | |||
| for(i=0; i<8; i++) c->dither32[i] = chrDither[(i+3)&7]<<12; | |||
| dither_8to32(c, chrDither, 1); | |||
| YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off) | |||
| } | |||
| for(i=0; i<8; i++) c->dither32[i] = lumDither[i]<<12; | |||
| dither_8to32(c, lumDither, 0); | |||
| if (CONFIG_SWSCALE_ALPHA && aDest) { | |||
| YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0) | |||
| } | |||
| @@ -187,8 +280,7 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, | |||
| static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, | |||
| const int16_t *chrUSrc, const int16_t *chrVSrc, | |||
| const int16_t *alpSrc, | |||
| uint8_t *dst[4], int dstW, int chrDstW, | |||
| const uint8_t *lumDither, const uint8_t *chrDither) | |||
| uint8_t *dst[4], int dstW, int chrDstW) | |||
| { | |||
| int p= 4; | |||
| const int16_t *src[4]= { | |||
| @@ -222,8 +314,7 @@ static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, | |||
| static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, | |||
| const int16_t *chrUSrc, const int16_t *chrVSrc, | |||
| const int16_t *alpSrc, | |||
| uint8_t *dst[4], int dstW, int chrDstW, | |||
| const uint8_t *lumDither, const uint8_t *chrDither) | |||
| uint8_t *dst[4], int dstW, int chrDstW) | |||
| { | |||
| int p= 4; | |||
| const int16_t *src[4]= { | |||
| @@ -231,15 +322,16 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, | |||
| chrVSrc + chrDstW, alpSrc + dstW | |||
| }; | |||
| x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW }; | |||
| const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; | |||
| while (p--) { | |||
| if (dst[p]) { | |||
| int i; | |||
| for(i=0; i<8; i++) c->dither16[i] = i<2 ? lumDither[i] : chrDither[i]; | |||
| for(i=0; i<8; i++) c->dither16[i] = (p == 2 || p == 3) ? lumDither[i] : chrDither[i]; | |||
| __asm__ volatile( | |||
| "mov %2, %%"REG_a" \n\t" | |||
| "movq 0(%3), %%mm6 \n\t" | |||
| "movq 8(%3), %%mm7 \n\t" | |||
| "movq "DITHER16"+0(%3), %%mm6 \n\t" | |||
| "movq "DITHER16"+8(%3), %%mm7 \n\t" | |||
| ".p2align 4 \n\t" /* FIXME Unroll? */ | |||
| "1: \n\t" | |||
| "movq (%0, %%"REG_a", 2), %%mm0 \n\t" | |||
| @@ -253,7 +345,7 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, | |||
| "add $8, %%"REG_a" \n\t" | |||
| "jnc 1b \n\t" | |||
| :: "r" (src[p]), "r" (dst[p] + counter[p]), | |||
| "g" (-counter[p]), "r"(c->dither16) | |||
| "g" (-counter[p]), "r"(&c->redDither) | |||
| : "%"REG_a | |||
| ); | |||
| } | |||