swscale: refactor horizontal scaling

+ split color conversion from scaling - disabled gamma correction, until it's refactored too Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
10 years ago · e0a3173a94
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -15,6 +15,8 @@ OBJS = alphablend.o                                     \
       swscale_unscaled.o                               \
       utils.o                                          \
       yuv2rgb.o                                        \
       slice.o                                          \
       hscale.o                                         \
 OBJS-$(CONFIG_SHARED)        += log2_tab.o
--- a/libswscale/hscale.c
+++ b/libswscale/hscale.c
@@ -0,0 +1,254 @@
 #include "swscale_internal.h"
 static int lum_h_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
 {
    FilterContext *instance = desc->instance;
    int srcW = desc->src->width;
    int dstW = desc->dst->width;
    int xInc = instance->xInc;
    int i;
    for (i = 0; i < sliceH; ++i) {
        uint8_t ** src = desc->src->plane[0].line;
        uint8_t ** dst = desc->dst->plane[0].line;
        int src_pos = sliceY+i - desc->src->plane[0].sliceY;
        int dst_pos = sliceY+i - desc->dst->plane[0].sliceY;
        if (c->hyscale_fast) {
            c->hyscale_fast(c, (int16_t*)dst[dst_pos], dstW, src[src_pos], srcW, xInc);
        } else {
            c->hyScale(c, (int16_t*)dst[dst_pos], dstW, (const uint8_t *)src[src_pos], instance->filter,
                       instance->filter_pos, instance->filter_size);
        }
        if (c->lumConvertRange)
            c->lumConvertRange((int16_t*)dst[dst_pos], dstW);
        desc->dst->plane[0].sliceH += 1;
        if (desc->alpha) {
            src = desc->src->plane[3].line;
            dst = desc->dst->plane[3].line;
            src_pos = sliceY+i - desc->src->plane[3].sliceY;
            dst_pos = sliceY+i - desc->dst->plane[3].sliceY;
            desc->dst->plane[3].sliceH += 1;
            if (c->hyscale_fast) {
                c->hyscale_fast(c, (int16_t*)dst[dst_pos], dstW, src[src_pos], srcW, xInc);
            } else {
                c->hyScale(c, (int16_t*)dst[dst_pos], dstW, (const uint8_t *)src[src_pos], instance->filter,
                            instance->filter_pos, instance->filter_size);
            }
        }
    }
    return sliceH;
 }
 static int lum_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
 {
    int srcW = desc->src->width;
    ColorContext * instance = desc->instance;
    uint32_t * pal = instance->pal;
    int i;
    desc->dst->plane[0].sliceY = sliceY;
    desc->dst->plane[0].sliceH = sliceH;
    desc->dst->plane[3].sliceY = sliceY;
    desc->dst->plane[3].sliceH = sliceH;
    for (i = 0; i < sliceH; ++i) {
        int sp0 = sliceY+i - desc->src->plane[0].sliceY;
        int sp1 = ((sliceY+i) >> desc->src->v_chr_sub_sample) - desc->src->plane[1].sliceY;
        const uint8_t * src[4] = { desc->src->plane[0].line[sp0],
                        desc->src->plane[1].line[sp1],
                        desc->src->plane[2].line[sp1],
                        desc->src->plane[3].line[sp0]};
        uint8_t * dst = desc->dst->plane[0].line[i];
        if (c->lumToYV12) {
            c->lumToYV12(dst, src[0], src[1], src[2], srcW, pal);
        } else if (c->readLumPlanar) {
            c->readLumPlanar(dst, src, srcW, c->input_rgb2yuv_table);
        }
        if (desc->alpha) {
            dst = desc->dst->plane[3].line[i];
            if (c->alpToYV12) {
                c->alpToYV12(dst, src[3], src[1], src[2], srcW, pal);
            } else if (c->readAlpPlanar) {
                c->readAlpPlanar(dst, src, srcW, NULL);
            }
        }
    }
    return sliceH;
 }
 int ff_init_desc_fmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal)
 {
    ColorContext * li = av_malloc(sizeof(ColorContext));
    if (!li)
        return AVERROR(ENOMEM);
    li->pal = pal;
    desc->instance = li;
    desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
    desc->src =src;
    desc->dst = dst;
    desc->process = &lum_convert;
    return 0;
 }
 int ff_init_desc_hscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc)
 {
    FilterContext *li = av_malloc(sizeof(FilterContext));
    if (!li)
        return AVERROR(ENOMEM);
    li->filter = filter;
    li->filter_pos = filter_pos;
    li->filter_size = filter_size;
    li->xInc = xInc;
    desc->instance = li;
    desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
    desc->src = src;
    desc->dst = dst;
    desc->process = &lum_h_scale;
    return 0;
 }
 static int chr_h_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
 {
    FilterContext *instance = desc->instance;
    int srcW = FF_CEIL_RSHIFT(desc->src->width, desc->src->h_chr_sub_sample);
    int dstW = FF_CEIL_RSHIFT(desc->dst->width, desc->dst->h_chr_sub_sample);
    int xInc = instance->xInc;
    uint8_t ** src1 = desc->src->plane[1].line;
    uint8_t ** dst1 = desc->dst->plane[1].line;
    uint8_t ** src2 = desc->src->plane[2].line;
    uint8_t ** dst2 = desc->dst->plane[2].line;
    int src_pos1 = sliceY - desc->src->plane[1].sliceY;
    int dst_pos1 = sliceY - desc->dst->plane[1].sliceY;
    int src_pos2 = sliceY - desc->src->plane[2].sliceY;
    int dst_pos2 = sliceY - desc->dst->plane[2].sliceY;
    int i;
    for (i = 0; i < sliceH; ++i) {
        if (c->hcscale_fast) {
            c->hcscale_fast(c, (uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW, src1[src_pos1+i], src2[src_pos2+i], srcW, xInc);
        } else {
            c->hcScale(c, (uint16_t*)dst1[dst_pos1+i], dstW, src1[src_pos1+i], instance->filter, instance->filter_pos, instance->filter_size);
            c->hcScale(c, (uint16_t*)dst2[dst_pos2+i], dstW, src2[src_pos2+i], instance->filter, instance->filter_pos, instance->filter_size);
        }
        if (c->chrConvertRange)
            c->chrConvertRange((uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW);
        desc->dst->plane[1].sliceH += 1;
        desc->dst->plane[2].sliceH += 1;
    }
    return sliceH;
 }
 static int chr_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
 {
    int srcW = FF_CEIL_RSHIFT(desc->src->width, desc->src->h_chr_sub_sample);
    ColorContext * instance = desc->instance;
    uint32_t * pal = instance->pal;
    int sp0 = (sliceY - (desc->src->plane[0].sliceY >> desc->src->v_chr_sub_sample)) << desc->src->v_chr_sub_sample;
    int sp1 = sliceY - desc->src->plane[1].sliceY;
    int i;
    desc->dst->plane[1].sliceY = sliceY;
    desc->dst->plane[1].sliceH = sliceH;
    desc->dst->plane[2].sliceY = sliceY;
    desc->dst->plane[2].sliceH = sliceH;
    for (i = 0; i < sliceH; ++i) {
        const uint8_t * src[4] = { desc->src->plane[0].line[sp0+i],
                        desc->src->plane[1].line[sp1+i],
                        desc->src->plane[2].line[sp1+i],
                        desc->src->plane[3].line[sp0+i]};
        uint8_t * dst1 = desc->dst->plane[1].line[i];
        uint8_t * dst2 = desc->dst->plane[2].line[i];
        if (c->chrToYV12) {
            c->chrToYV12(dst1, dst2, src[0], src[1], src[2], srcW, pal);
        } else if (c->readChrPlanar) {
            c->readChrPlanar(dst1, dst2, src, srcW, c->input_rgb2yuv_table);
        }
    }
    return sliceH;
 }
 int ff_init_desc_cfmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal)
 {
    ColorContext * li = av_malloc(sizeof(ColorContext));
    if (!li)
        return AVERROR(ENOMEM);
    li->pal = pal;
    desc->instance = li;
    desc->src =src;
    desc->dst = dst;
    desc->process = &chr_convert;
    return 0;
 }
 int ff_init_desc_chscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc)
 {
    FilterContext *li = av_malloc(sizeof(FilterContext));
    if (!li)
        return AVERROR(ENOMEM);
    li->filter = filter;
    li->filter_pos = filter_pos;
    li->filter_size = filter_size;
    li->xInc = xInc;
    desc->instance = li;
    desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
    desc->src = src;
    desc->dst = dst;
    desc->process = &chr_h_scale;
    return 0;
 }
 static int no_chr_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
 {
    desc->dst->plane[1].sliceY = sliceY + sliceH - desc->dst->plane[1].available_lines;
    desc->dst->plane[1].sliceH = desc->dst->plane[1].available_lines;
    desc->dst->plane[2].sliceY = sliceY + sliceH - desc->dst->plane[2].available_lines;
    desc->dst->plane[2].sliceH = desc->dst->plane[2].available_lines;
    return 0;
 }
 int ff_init_desc_no_chr(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst)
 {
    desc->src = src;
    desc->dst = dst;
    desc->alpha = 0;
    desc->instance = NULL;
    desc->process = &no_chr_scale;
    return 0;
 }
--- a/libswscale/slice.c
+++ b/libswscale/slice.c
@@ -0,0 +1,295 @@
 #include "swscale_internal.h"
 static void free_lines(SwsSlice *s)
 {
    int i;
    for (i = 0; i < 2; ++i) {
        int n = s->plane[i].available_lines;
        int j;
        for (j = 0; j < n; ++j) {
            av_freep(&s->plane[i].line[j]);
            if (s->is_ring)
               s->plane[i].line[j+n] = NULL;
        }
    }
    for (i = 0; i < 4; ++i)
        memset(s->plane[i].line, 0, sizeof(uint8_t*) * s->plane[i].available_lines * (s->is_ring ? 3 : 1));
    s->should_free_lines = 0;
 }
 /*
 slice lines contains extra bytes for vetorial code thus @size
 is the allocated memory size and @width is the number of pixels
 */
 static int alloc_lines(SwsSlice *s, int size, int width)
 {
    int i;
    int idx[2] = {3, 2};
    s->should_free_lines = 1;
    s->width = width;
    for (i = 0; i < 2; ++i) {
        int n = s->plane[i].available_lines;
        int j;
        int ii = idx[i];
        av_assert0(n == s->plane[ii].available_lines);
        for (j = 0; j < n; ++j) {
            // chroma plane line U and V are expected to be contiguous in memory
            // by mmx vertical scaler code
            s->plane[i].line[j] = av_malloc(size * 2 + 32);
            if (!s->plane[i].line[j]) {
                free_lines(s);
                return AVERROR(ENOMEM);
            }
            s->plane[ii].line[j] = s->plane[i].line[j] + size + 16;
            if (s->is_ring) {
               s->plane[i].line[j+n] = s->plane[i].line[j];
               s->plane[ii].line[j+n] = s->plane[ii].line[j];
            }
        }
    }
    return 0;
 }
 static int alloc_slice(SwsSlice *s, enum AVPixelFormat fmt, int lumLines, int chrLines, int h_sub_sample, int v_sub_sample, int ring)
 {
    int i;
    int size[4] = { lumLines,
                    chrLines,
                    chrLines,
                    lumLines };
    s->h_chr_sub_sample = h_sub_sample;
    s->v_chr_sub_sample = v_sub_sample;
    s->fmt = fmt;
    s->is_ring = ring;
    s->should_free_lines = 0;
    for (i = 0; i < 4; ++i) {
        int n = size[i] * ( ring == 0 ? 1 : 3);
        s->plane[i].line = av_mallocz_array(sizeof(uint8_t*), n);
        if (!s->plane[i].line)
            return AVERROR(ENOMEM);
        s->plane[i].tmp = ring ? s->plane[i].line + size[i] * 2 : NULL;
        s->plane[i].available_lines = size[i];
        s->plane[i].sliceY = 0;
        s->plane[i].sliceH = 0;
    }
    return 0;
 }
 static void free_slice(SwsSlice *s)
 {
    int i;
    if (s) {
        if (s->should_free_lines)
            free_lines(s);
        for (i = 0; i < 4; ++i) {
            av_freep(&s->plane[i].line);
            s->plane[i].tmp = NULL;
        }
    }
 }
 int ff_rotate_slice(SwsSlice *s, int lum, int chr)
 {
    int i;
    if (lum) {
        for (i = 0; i < 4; i+=3) {
            int n = s->plane[i].available_lines;
            int l = lum - s->plane[i].sliceY;
            if (l >= n * 2) {
                s->plane[i].sliceY += n;
                s->plane[i].sliceH -= n;
            }
        }
    }
    if (chr) {
        for (i = 1; i < 3; ++i) {
            int n = s->plane[i].available_lines;
            int l = chr - s->plane[i].sliceY;
            if (l >= n * 2) {
                s->plane[i].sliceY += n;
                s->plane[i].sliceH -= n;
            }
        }
    }
    return 0;
 }
 int ff_init_slice_from_src(SwsSlice * s, uint8_t *src[4], int stride[4], int srcW, int lumY, int lumH, int chrY, int chrH)
 {
    int i = 0;
    const int start[4] = {lumY,
                    chrY,
                    chrY,
                    lumY};
    const int end[4] = {lumY +lumH,
                        chrY + chrH,
                        chrY + chrH,
                        lumY + lumH};
    s->width = srcW;
    for (i = 0; i < 4; ++i) {
        int j;
        int lines = end[i];
        lines = s->plane[i].available_lines < lines ? s->plane[i].available_lines : lines;
        if (end[i] > s->plane[i].sliceY+s->plane[i].sliceH) {
            if (start[i] <= s->plane[i].sliceY+1)
                s->plane[i].sliceY = FFMIN(start[i], s->plane[i].sliceY);
            else
                s->plane[i].sliceY = start[i];
            s->plane[i].sliceH = end[i] - s->plane[i].sliceY;
        } else {
            if (end[i] >= s->plane[i].sliceY)
                s->plane[i].sliceH = s->plane[i].sliceY + s->plane[i].sliceH - start[i];
            else
                s->plane[i].sliceH = end[i] - start[i];
            s->plane[i].sliceY = start[i];
        }
        for (j = start[i]; j < lines; j+= 1)
            s->plane[i].line[j] = src[i] + (start[i] + j) * stride[i];
    }
    return 0;
 }
 static void fill_ones(SwsSlice *s, int n, int is16bit)
 {
    int i;
    for (i = 0; i < 4; ++i) {
        int j;
        int size = s->plane[i].available_lines;
        for (j = 0; j < size; ++j) {
            int k;
            int end = is16bit ? n>>1: n;
            // fill also one extra element
            end += 1;
            if (is16bit)
                for (k = 0; k < end; ++k)
                    ((int32_t*)(s->plane[i].line[j]))[k] = 1<<18;
            else
                for (k = 0; k < end; ++k)
                    ((int16_t*)(s->plane[i].line[j]))[k] = 1<<14;
        }
    }
 }
 int ff_init_filters(SwsContext * c)
 {
    int i;
    int index;
    int num_ydesc;
    int num_cdesc;
    int need_lum_conv = c->lumToYV12 || c->readLumPlanar || c->alpToYV12 || c->readAlpPlanar;
    int need_chr_conv = c->chrToYV12 || c->readChrPlanar;
    int srcIdx, dstIdx;
    int dst_stride = FFALIGN(c->dstW * sizeof(int16_t) + 66, 16);
    uint32_t * pal = usePal(c->srcFormat) ? c->pal_yuv : (uint32_t*)c->input_rgb2yuv_table;
    int res = 0;
    if (c->dstBpc == 16)
        dst_stride <<= 1;
    num_ydesc = need_lum_conv ? 2 : 1;
    num_cdesc = need_chr_conv ? 2 : 1;
    c->numSlice = FFMAX(num_ydesc, num_cdesc) + 1;
    c->numDesc = num_ydesc + num_cdesc;
    c->descIndex[0] = num_ydesc;
    c->descIndex[1] = num_ydesc + num_cdesc;
    c->desc = av_mallocz_array(sizeof(SwsFilterDescriptor), c->numDesc);
    if (!c->desc)
        return AVERROR(ENOMEM);
    c->slice = av_mallocz_array(sizeof(SwsSlice), c->numSlice);
    res = alloc_slice(&c->slice[0], c->srcFormat, c->srcH, c->chrSrcH, c->chrSrcHSubSample, c->chrSrcVSubSample, 0);
    if (res < 0) goto cleanup;
    for (i = 1; i < c->numSlice-1; ++i) {
        res = alloc_slice(&c->slice[i], c->srcFormat, c->vLumFilterSize, c->vChrFilterSize, c->chrSrcHSubSample, c->chrSrcVSubSample, 0);
        if (res < 0) goto cleanup;
        res = alloc_lines(&c->slice[i], FFALIGN(c->srcW*2+78, 16), c->srcW);
        if (res < 0) goto cleanup;
    }
    res = alloc_slice(&c->slice[i], c->srcFormat, c->vLumFilterSize, c->vChrFilterSize, c->chrDstHSubSample, c->chrDstVSubSample, 1);
    if (res < 0) goto cleanup;
    res = alloc_lines(&c->slice[i], dst_stride, c->dstW);
    if (res < 0) goto cleanup;
    fill_ones(&c->slice[i], dst_stride>>1, c->dstBpc == 16);
    index = 0;
    srcIdx = 0;
    dstIdx = 1;
    if (need_lum_conv) {
        ff_init_desc_fmt_convert(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], pal);
        c->desc[index].alpha = c->alpPixBuf != 0;
        ++index;
        srcIdx = dstIdx;
    }
    dstIdx = FFMAX(num_ydesc, num_cdesc);
    ff_init_desc_hscale(&c->desc[index], &c->slice[index], &c->slice[dstIdx], c->hLumFilter, c->hLumFilterPos, c->hLumFilterSize, c->lumXInc);
    c->desc[index].alpha = c->alpPixBuf != 0;
    ++index;
    {
        srcIdx = 0;
        dstIdx = 1;
        if (need_chr_conv) {
            ff_init_desc_cfmt_convert(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], pal);
            ++index;
            srcIdx = dstIdx;
        }
        dstIdx = FFMAX(num_ydesc, num_cdesc);
        if (c->needs_hcscale)
            ff_init_desc_chscale(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], c->hChrFilter, c->hChrFilterPos, c->hChrFilterSize, c->chrXInc);
        else
            ff_init_desc_no_chr(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx]);
    }
    return 0;
 cleanup:
    ff_free_filters(c);
    return res;
 }
 int ff_free_filters(SwsContext *c)
 {
    int i;
    if (c->desc) {
        for (i = 0; i < c->numDesc; ++i)
            av_freep(&c->desc[i].instance);
        av_freep(&c->desc);
    }
    if (c->slice) {
        for (i = 0; i < c->numSlice; ++i)
            free_slice(&c->slice[i]);
        av_freep(&c->slice);
    }
    return 0;
 }
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -371,6 +371,15 @@ static int swscale(SwsContext *c, const uint8_t *src[],
    int lastInChrBuf = c->lastInChrBuf;
    int perform_gamma = c->is_internal_gamma;
    int numDesc = c->numDesc;
    int lumStart = 0;
    int lumEnd = c->descIndex[0];
    int chrStart = lumEnd;
    int chrEnd = c->descIndex[1];
    SwsSlice *src_slice = &c->slice[lumStart];
    SwsSlice *dst_slice = &c->slice[c->numSlice-1];
    SwsFilterDescriptor *desc = c->desc;
    if (!usePal(c->srcFormat)) {
        pal = c->input_rgb2yuv_table;
@@ -439,6 +448,23 @@ static int swscale(SwsContext *c, const uint8_t *src[],
    }
    lastDstY = dstY;
 #define NEW_FILTER 1
    ff_init_slice_from_src(src_slice, (uint8_t**)src, srcStride, c->srcW,
            srcSliceY, srcSliceH,
            chrSrcSliceY, chrSrcSliceH);
    dst_slice->plane[0].sliceY = lastInLumBuf + 1;
    dst_slice->plane[1].sliceY = lastInChrBuf + 1;
    dst_slice->plane[2].sliceY = lastInChrBuf + 1;
    dst_slice->plane[3].sliceY = lastInLumBuf + 1;
    dst_slice->plane[0].sliceH =
    dst_slice->plane[1].sliceH =
    dst_slice->plane[2].sliceH =
    dst_slice->plane[3].sliceH = 0;
    dst_slice->width = dstW;
    for (; dstY < dstH; dstY++) {
        const int chrDstY = dstY >> c->chrDstVSubSample;
        uint8_t *dest[4]  = {
@@ -460,12 +486,23 @@ static int swscale(SwsContext *c, const uint8_t *src[],
        int lastLumSrcY2 = FFMIN(c->srcH,    firstLumSrcY2 + vLumFilterSize) - 1;
        int lastChrSrcY  = FFMIN(c->chrSrcH, firstChrSrcY  + vChrFilterSize) - 1;
        int enough_lines;
        int i;
        // handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf)
        if (firstLumSrcY > lastInLumBuf) {
            lastInLumBuf = firstLumSrcY - 1;
        if (firstChrSrcY > lastInChrBuf)
            dst_slice->plane[0].sliceY = lastInLumBuf + 1;
            dst_slice->plane[3].sliceY = lastInLumBuf + 1;
            dst_slice->plane[0].sliceH =
            dst_slice->plane[3].sliceH = 0;
        }
        if (firstChrSrcY > lastInChrBuf) {
            lastInChrBuf = firstChrSrcY - 1;
            dst_slice->plane[1].sliceY = lastInChrBuf + 1;
            dst_slice->plane[2].sliceY = lastInChrBuf + 1;
            dst_slice->plane[1].sliceH =
            dst_slice->plane[2].sliceH = 0;
        }
        av_assert0(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        av_assert0(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
@@ -486,6 +523,22 @@ static int swscale(SwsContext *c, const uint8_t *src[],
                          lastLumSrcY, lastChrSrcY);
        }
 #if NEW_FILTER
        ff_rotate_slice(dst_slice, lastLumSrcY, lastChrSrcY);
        if (lastInLumBuf < lastLumSrcY)
            for (i = lumStart; i < lumEnd; ++i)
                desc[i].process(c, &desc[i], lastInLumBuf + 1, lastLumSrcY - lastInLumBuf);
        lumBufIndex += lastLumSrcY - lastInLumBuf;
        lastInLumBuf = lastLumSrcY;
        if (lastInChrBuf < lastChrSrcY)
            for (i = chrStart; i < chrEnd; ++i)
                desc[i].process(c, &desc[i], lastInChrBuf + 1, lastChrSrcY - lastInChrBuf);
        chrBufIndex += lastChrSrcY - lastInChrBuf;
        lastInChrBuf = lastChrSrcY;
 #else
        // Do horizontal scaling
        while (lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1[4] = {
@@ -499,8 +552,8 @@ static int swscale(SwsContext *c, const uint8_t *src[],
            av_assert0(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            av_assert0(lastInLumBuf + 1 - srcSliceY >= 0);
            if (perform_gamma)
                gamma_convert((uint8_t **)src1, srcW, c->inv_gamma);
            //if (perform_gamma)
            //    gamma_convert((uint8_t **)src1, srcW, c->inv_gamma);
            hyscale(c, lumPixBuf[lumBufIndex], dstW, src1, srcW, lumXInc,
                    hLumFilter, hLumFilterPos, hLumFilterSize,
@@ -535,6 +588,7 @@ static int swscale(SwsContext *c, const uint8_t *src[],
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                          chrBufIndex, lastInChrBuf);
        }
 #endif
        // wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize)
            lumBufIndex -= vLumBufSize;
@@ -560,11 +614,19 @@ static int swscale(SwsContext *c, const uint8_t *src[],
        }
        {
 #if NEW_FILTER
            const int16_t **lumSrcPtr  = (const int16_t **)(void*) dst_slice->plane[0].line + firstLumSrcY - dst_slice->plane[0].sliceY;
            const int16_t **chrUSrcPtr = (const int16_t **)(void*) dst_slice->plane[1].line + firstChrSrcY - dst_slice->plane[1].sliceY;
            const int16_t **chrVSrcPtr = (const int16_t **)(void*) dst_slice->plane[2].line + firstChrSrcY - dst_slice->plane[2].sliceY;
            const int16_t **alpSrcPtr  = (CONFIG_SWSCALE_ALPHA && alpPixBuf) ?
                                         (const int16_t **)(void*) dst_slice->plane[3].line + firstLumSrcY - dst_slice->plane[3].sliceY : NULL;
 #else
            const int16_t **lumSrcPtr  = (const int16_t **)(void*) lumPixBuf  + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **chrVSrcPtr = (const int16_t **)(void*) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr  = (CONFIG_SWSCALE_ALPHA && alpPixBuf) ?
                                         (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
 #endif
            int16_t *vLumFilter = c->vLumFilter;
            int16_t *vChrFilter = c->vChrFilter;
@@ -629,8 +691,10 @@ static int swscale(SwsContext *c, const uint8_t *src[],
                    }
                }
            } else if (yuv2packedX) {
 #if !NEW_FILTER
                av_assert1(lumSrcPtr  + vLumFilterSize - 1 < (const int16_t **)lumPixBuf  + vLumBufSize * 2);
                av_assert1(chrUSrcPtr + vChrFilterSize - 1 < (const int16_t **)chrUPixBuf + vChrBufSize * 2);
 #endif
                if (c->yuv2packed1 && vLumFilterSize == 1 &&
                    vChrFilterSize <= 2) { // unscaled RGB
                    int chrAlpha = vChrFilterSize == 1 ? 0 : vChrFilter[2 * dstY + 1];
@@ -663,8 +727,8 @@ static int swscale(SwsContext *c, const uint8_t *src[],
                         chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
                         alpSrcPtr, dest, dstW, dstY);
            }
            if (perform_gamma)
                gamma_convert(dest, dstW, c->gamma);
            //if (perform_gamma)
            //    gamma_convert(dest, dstW, c->gamma);
        }
    }
    if (isPlanar(dstFormat) && isALPHA(dstFormat) && !alpPixBuf) {
@@ -1151,4 +1215,3 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
    av_free(rgb0_tmp);
    return ret;
 }
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -276,6 +276,9 @@ typedef void (*yuv2anyX_fn)(struct SwsContext *c, const int16_t *lumFilter,
                            const int16_t **alpSrc, uint8_t **dest,
                            int dstW, int y);
 struct SwsSlice;
 struct SwsFilterDescriptor;
 /* This struct should be aligned on at least a 32-byte boundary. */
 typedef struct SwsContext {
    /**
@@ -326,6 +329,12 @@ typedef struct SwsContext {
    uint16_t *gamma;
    uint16_t *inv_gamma;
    int numDesc;
    int descIndex[2];
    int numSlice;
    struct SwsSlice *slice;
    struct SwsFilterDescriptor *desc;
    uint32_t pal_yuv[256];
    uint32_t pal_rgb[256];
@@ -934,4 +943,95 @@ static inline void fillPlane16(uint8_t *plane, int stride, int width, int height
    }
 }
 #define MAX_SLICE_PLANES 4
 /// Slice plane
 typedef struct SwsPlane
 {
    int available_lines;    ///< max number of lines that can be hold by this plane
    int sliceY;             ///< index of first line
    int sliceH;             ///< number of lines
    uint8_t **line;         ///< line buffer
    uint8_t **tmp;          ///< Tmp line buffer used by mmx code
 } SwsPlane;
 /**
 * Struct which defines a slice of an image to be scaled or a output for
 * a scaled slice.
 * A slice can also be used as intermediate ring buffer for scaling steps.
 */
 typedef struct SwsSlice
 {
    int width;              ///< Slice line width
    int h_chr_sub_sample;   ///< horizontal chroma subsampling factor
    int v_chr_sub_sample;   ///< vertical chroma subsampling factor
    int is_ring;            ///< flag to identify if this slice is a ring buffer
    int should_free_lines;  ///< flag to identify if there are dynamic allocated lines
    enum AVPixelFormat fmt; ///< planes pixel format
    SwsPlane plane[MAX_SLICE_PLANES];   ///< color planes
 } SwsSlice;
 /**
 * Struct which holds all necessary data for processing a slice.
 * A processing step can be a color conversion or horizontal/vertical scaling.
 */
 typedef struct SwsFilterDescriptor
 {
    SwsSlice *src;  ///< Source slice
    SwsSlice *dst;  ///< Output slice
    int alpha;      ///< Flag for processing alpha channel
    void *instance; ///< Filter instance data
    /// Function for processing input slice sliceH lines starting from line sliceY
    int (*process)(SwsContext *c, struct SwsFilterDescriptor *desc, int sliceY, int sliceH);
 } SwsFilterDescriptor;
 /// Color conversion instance data
 typedef struct ColorContext
 {
    uint32_t *pal;
 } ColorContext;
 /// Scaler instance data
 typedef struct FilterContext
 {
    uint16_t *filter;
    int *filter_pos;
    int filter_size;
    int xInc;
 } FilterContext;
 // warp input lines in the form (src + width*i + j) to slice format (line[i][j])
 int ff_init_slice_from_src(SwsSlice * s, uint8_t *src[4], int stride[4], int srcW, int lumY, int lumH, int chrY, int chrH);
 // Initialize scaler filter descriptor chain
 int ff_init_filters(SwsContext *c);
 // Free all filter data
 int ff_free_filters(SwsContext *c);
 /*
 function for applying ring buffer logic into slice s
 It checks if the slice can hold more @lum lines, if yes
 do nothing otherwise remove @lum least used lines.
 It applyes the same procedure for @chr lines.
 */
 int ff_rotate_slice(SwsSlice *s, int lum, int chr);
 /// initializes lum pixel format conversion descriptor
 int ff_init_desc_fmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal);
 /// initializes lum horizontal scaling descriptor
 int ff_init_desc_hscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc);
 /// initializes chr prixel format conversion descriptor
 int ff_init_desc_cfmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal);
 /// initializes chr horizontal scaling descriptor
 int ff_init_desc_chscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc);
 int ff_init_desc_no_chr(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst);
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1702,7 +1702,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
    }
    c->swscale = ff_getSwsFunc(c);
    return 0;
    return ff_init_filters(c);
 fail: // FIXME replace things by appropriate error codes
    if (ret == RETCODE_USE_CASCADE)  {
        int tmpW = sqrt(srcW * (int64_t)dstW);
@@ -2219,6 +2219,7 @@ void sws_freeContext(SwsContext *c)
    av_freep(&c->gamma);
    av_freep(&c->inv_gamma);
    ff_free_filters(c);
    av_free(c);
 }
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -85,9 +85,17 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
 {
    const int dstH= c->dstH;
    const int flags= c->flags;
 #define NEW_FILTER 1
 #if NEW_FILTER
    SwsPlane *lumPlane = &c->slice[c->numSlice-1].plane[0];
    SwsPlane *chrUPlane = &c->slice[c->numSlice-1].plane[1];
    SwsPlane *alpPlane = &c->slice[c->numSlice-1].plane[3];
 #else
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrUPixBuf= c->chrUPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
 #endif
    int hasAlpha = c->alpPixBuf != NULL;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    int32_t *vLumFilterPos= c->vLumFilterPos;
@@ -110,13 +118,22 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
        c->greenDither= ff_dither4[dstY&1];
    c->redDither= ff_dither8[(dstY+1)&1];
    if (dstY < dstH - 2) {
 #if NEW_FILTER
        const int16_t **lumSrcPtr  = (const int16_t **)(void*) lumPlane->line + firstLumSrcY - lumPlane->sliceY;
        const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPlane->line + firstChrSrcY - chrUPlane->sliceY;
        const int16_t **alpSrcPtr  = (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) ? (const int16_t **)(void*) alpPlane->line + firstLumSrcY - alpPlane->sliceY : NULL;
 #else
        const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
        const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
        const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
 #endif
        int i;
        if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
 #if NEW_FILTER
            const int16_t **tmpY = (const int16_t **) lumPlane->tmp;
 #else
            const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize;
 #endif
            int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
            for (i = 0; i < neg;            i++)
                tmpY[i] = lumSrcPtr[neg];
@@ -127,7 +144,11 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
            lumSrcPtr = tmpY;
            if (alpSrcPtr) {
 #if NEW_FILTER
                const int16_t **tmpA = (const int16_t **) alpPlane->tmp;
 #else
                const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize;
 #endif
                for (i = 0; i < neg;            i++)
                    tmpA[i] = alpSrcPtr[neg];
                for (     ; i < end;            i++)
@@ -138,7 +159,11 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
            }
        }
        if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
 #if NEW_FILTER
            const int16_t **tmpU = (const int16_t **) chrUPlane->tmp;
 #else
            const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize;
 #endif
            int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
            for (i = 0; i < neg;            i++) {
                tmpU[i] = chrUSrcPtr[neg];
@@ -160,7 +185,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
                lumMmxFilter[s*i+APCK_COEF/4  ]=
                lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
                    *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                    *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                    alpMmxFilter[s*i+APCK_COEF/4  ]=
@@ -180,7 +205,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
                lumMmxFilter[4*i+2]=
                lumMmxFilter[4*i+3]=
                ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
                    *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
                    alpMmxFilter[4*i+2]=
                    alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];