avcodec/utvideodec: add SIMD support for median prediction for planar formats

~10% faster overall. Signed-off-by: Paul B Mahol <onemda@gmail.com>
9 years ago · ea93052db3
--- a/configure
+++ b/configure
@@ -2531,7 +2531,7 @@ truespeech_decoder_select="bswapdsp"
 tscc_decoder_select="zlib"
 twinvq_decoder_select="mdct lsp sinewin"
 txd_decoder_select="texturedsp"
 utvideo_decoder_select="bswapdsp"
 utvideo_decoder_select="bswapdsp huffyuvdsp"
 utvideo_encoder_select="bswapdsp huffman huffyuvencdsp"
 vble_decoder_select="huffyuvdsp"
 vc1_decoder_select="blockdsp h263_decoder h264qpel intrax8 mpegvideo vc1dsp"
--- a/libavcodec/utvideo.h
+++ b/libavcodec/utvideo.h
@@ -30,6 +30,7 @@
 #include "libavutil/common.h"
 #include "avcodec.h"
 #include "bswapdsp.h"
 #include "huffyuvdsp.h"
 #include "huffyuvencdsp.h"

 enum {
@@ -69,6 +70,7 @@ typedef struct UtvideoContext {
    const AVClass *class;
    AVCodecContext *avctx;
    BswapDSPContext bdsp;
    HuffYUVDSPContext hdspdec;
    HuffYUVEncDSPContext hdsp;

    uint32_t frame_info_size, flags, frame_info;
--- a/libavcodec/utvideodec.c
+++ b/libavcodec/utvideodec.c
@@ -372,8 +372,111 @@ static void restore_rgb_planes10(AVFrame *frame, int width, int height)
    }
 }

 static void restore_median(uint8_t *src, int step, int stride,
                           int width, int height, int slices, int rmode)
 #undef A
 #undef B
 #undef C

 /* Undo median prediction in place on one progressive (non-interlaced) plane.
 *
 * The plane is processed as `slices` independent slices. Slice boundaries are
 * (slice * height / slices) with the bits of `rmode` cleared, so a caller
 * passing rmode = 1 gets boundaries on even rows. Empty slices are skipped.
 *
 * Inside a slice:
 *   - row 0: the first sample is biased by 0x80, then the row is restored
 *            with left prediction;
 *   - row 1: the first sample is predicted from the sample above it, the
 *            remaining ones from median(left, top, left + top - topleft);
 *   - rows 2..: median prediction for every sample, where the "left" and
 *            "topleft" of the first sample of a row are the values carried
 *            over from the end of the previous row.
 *
 * c       decoder context, used for the hdspdec prediction routines
 * src     first sample of the plane, modified in place
 * stride  distance in bytes between two consecutive rows
 * width   samples per row
 * height  rows in the plane
 * slices  number of slices the plane is split into
 * rmode   bit mask cleared from slice boundary rows
 */
 static void restore_median_planar(UtvideoContext *c, uint8_t *src, int stride,
                                   int width, int height, int slices, int rmode)
 {
    int i, j, slice;
    int A, B, C;
    uint8_t *bsrc;
    int slice_start, slice_height;
    const int cmask = ~rmode;

    for (slice = 0; slice < slices; slice++) {
        slice_start  = ((slice * height) / slices) & cmask;
        slice_height = ((((slice + 1) * height) / slices) & cmask) -
                       slice_start;

        if (!slice_height)
            continue;
        bsrc = src + slice_start * stride;

        // first line - left neighbour prediction
        bsrc[0] += 0x80;
        c->hdspdec.add_hfyu_left_pred(bsrc, bsrc, width, 0);
        bsrc += stride;
        if (slice_height <= 1)
            continue;
        // second line - first element has top prediction, the rest uses median
        C        = bsrc[-stride];
        bsrc[0] += C;
        A        = bsrc[0];
        for (i = 1; i < width; i++) {
            B        = bsrc[i - stride];
            bsrc[i] += mid_pred(A, B, (uint8_t)(A + B - C));
            C        = B;
            A        = bsrc[i];
        }
        bsrc += stride;
        // the rest of lines use continuous median prediction
        for (j = 2; j < slice_height; j++) {
            /* The running state is (left, topleft) = (A, C): A is the last
             * reconstructed sample and C the last top sample consumed.
             * B is only a scratch value of the scalar loop above and is
             * never assigned when width == 1, so it must not be passed. */
            c->hdspdec.add_hfyu_median_pred(bsrc, bsrc - stride,
                                            bsrc, width, &A, &C);
            bsrc += stride;
        }
    }
 }

 /* UtVideo interlaced mode treats every two lines as a single one,
 * so restoring function should take care of possible padding between
 * two parts of the same "line".
 *
 * Undo median prediction in place on one interlaced plane. Each logical
 * "line" consists of two consecutive rows; the prediction state (left and
 * topleft samples) runs from the end of the first row straight into the
 * start of the second one, skipping whatever padding the stride adds.
 * The row directly above a sample in the same logical column is therefore
 * 2 * stride bytes back for the first half of a line and stride bytes back
 * (relative to the line start) for the second half.
 *
 * Slice boundaries are (slice * height / slices) rounded down to a multiple
 * of 4 rows when rmode is non-zero and of 2 rows otherwise; the slice height
 * is then counted in row pairs. Empty slices are skipped.
 *
 * c       decoder context, used for the hdspdec prediction routines
 * src     first sample of the plane, modified in place
 * stride  distance in bytes between two consecutive rows
 * width   samples per row
 * height  rows in the plane
 * slices  number of slices the plane is split into
 * rmode   non-zero selects 4-row alignment of slice boundaries
 */
 static void restore_median_planar_il(UtvideoContext *c, uint8_t *src, int stride,
                                      int width, int height, int slices, int rmode)
 {
    int i, j, slice;
    int A, B, C;
    uint8_t *bsrc;
    int slice_start, slice_height;
    const int cmask   = ~(rmode ? 3 : 1);
    const int stride2 = stride << 1;

    for (slice = 0; slice < slices; slice++) {
        slice_start    = ((slice * height) / slices) & cmask;
        slice_height   = ((((slice + 1) * height) / slices) & cmask) -
                         slice_start;
        slice_height >>= 1;
        if (!slice_height)
            continue;

        bsrc = src + slice_start * stride;

        // first line - left neighbour prediction
        bsrc[0] += 0x80;
        // the value returned for the first row seeds the left neighbour
        // of the second row of the same logical line
        A = c->hdspdec.add_hfyu_left_pred(bsrc, bsrc, width, 0);
        c->hdspdec.add_hfyu_left_pred(bsrc + stride, bsrc + stride, width, A);
        bsrc += stride2;
        if (slice_height <= 1)
            continue;
        // second line - first element has top prediction, the rest uses median
        C        = bsrc[-stride2];
        bsrc[0] += C;
        A        = bsrc[0];
        for (i = 1; i < width; i++) {
            B        = bsrc[i - stride2];
            bsrc[i] += mid_pred(A, B, (uint8_t)(A + B - C));
            C        = B;
            A        = bsrc[i];
        }
        /* From here on the running state is (left, topleft) = (A, C):
         * A is the last reconstructed sample and C the last top sample
         * consumed. B is only a scratch value of the scalar loop above and
         * is never assigned when width == 1, so it must not be passed. */
        c->hdspdec.add_hfyu_median_pred(bsrc + stride, bsrc - stride,
                                        bsrc + stride, width, &A, &C);
        bsrc += stride2;
        // the rest of lines use continuous median prediction
        for (j = 2; j < slice_height; j++) {
            c->hdspdec.add_hfyu_median_pred(bsrc, bsrc - stride2,
                                            bsrc, width, &A, &C);
            c->hdspdec.add_hfyu_median_pred(bsrc + stride, bsrc - stride,
                                            bsrc + stride, width, &A, &C);
            bsrc += stride2;
        }
    }
 }

 static void restore_median_packed(uint8_t *src, int step, int stride,
                                  int width, int height, int slices, int rmode)
 {
    int i, j, slice;
    int A, B, C;
@@ -428,8 +531,8 @@ static void restore_median(uint8_t *src, int step, int stride,
 * so restoring function should take care of possible padding between
 * two parts of the same "line".
 */
 static void restore_median_il(uint8_t *src, int step, int stride,
                              int width, int height, int slices, int rmode)
 static void restore_median_packed_il(uint8_t *src, int step, int stride,
                                     int width, int height, int slices, int rmode)
 {
    int i, j, slice;
    int A, B, C;
@@ -608,14 +711,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                return ret;
            if (c->frame_pred == PRED_MEDIAN) {
                if (!c->interlaced) {
                    restore_median(frame.f->data[0] + ff_ut_rgb_order[i],
                                   c->planes, frame.f->linesize[0], avctx->width,
                                   avctx->height, c->slices, 0);
                    restore_median_packed(frame.f->data[0] + ff_ut_rgb_order[i],
                                          c->planes, frame.f->linesize[0], avctx->width,
                                          avctx->height, c->slices, 0);
                } else {
                    restore_median_il(frame.f->data[0] + ff_ut_rgb_order[i],
                                      c->planes, frame.f->linesize[0],
                                      avctx->width, avctx->height, c->slices,
                                      0);
                    restore_median_packed_il(frame.f->data[0] + ff_ut_rgb_order[i],
                                             c->planes, frame.f->linesize[0],
                                             avctx->width, avctx->height, c->slices,
                                             0);
                }
            }
        }
@@ -644,14 +747,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                return ret;
            if (c->frame_pred == PRED_MEDIAN) {
                if (!c->interlaced) {
                    restore_median(frame.f->data[i], 1, frame.f->linesize[i],
                                   avctx->width >> !!i, avctx->height >> !!i,
                                   c->slices, !i);
                    restore_median_planar(c, frame.f->data[i], frame.f->linesize[i],
                                          avctx->width >> !!i, avctx->height >> !!i,
                                          c->slices, !i);
                } else {
                    restore_median_il(frame.f->data[i], 1, frame.f->linesize[i],
                                      avctx->width  >> !!i,
                                      avctx->height >> !!i,
                                      c->slices, !i);
                    restore_median_planar_il(c, frame.f->data[i], frame.f->linesize[i],
                                             avctx->width  >> !!i,
                                             avctx->height >> !!i,
                                             c->slices, !i);
                }
            }
        }
@@ -665,13 +768,13 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                return ret;
            if (c->frame_pred == PRED_MEDIAN) {
                if (!c->interlaced) {
                    restore_median(frame.f->data[i], 1, frame.f->linesize[i],
                                   avctx->width >> !!i, avctx->height,
                                   c->slices, 0);
                    restore_median_planar(c, frame.f->data[i], frame.f->linesize[i],
                                          avctx->width >> !!i, avctx->height,
                                          c->slices, 0);
                } else {
                    restore_median_il(frame.f->data[i], 1, frame.f->linesize[i],
                                      avctx->width >> !!i, avctx->height,
                                      c->slices, 0);
                    restore_median_planar_il(c, frame.f->data[i], frame.f->linesize[i],
                                             avctx->width >> !!i, avctx->height,
                                             c->slices, 0);
                }
            }
        }
@@ -685,13 +788,13 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                return ret;
            if (c->frame_pred == PRED_MEDIAN) {
                if (!c->interlaced) {
                    restore_median(frame.f->data[i], 1, frame.f->linesize[i],
                                   avctx->width, avctx->height,
                                   c->slices, 0);
                    restore_median_planar(c, frame.f->data[i], frame.f->linesize[i],
                                          avctx->width, avctx->height,
                                          c->slices, 0);
                } else {
                    restore_median_il(frame.f->data[i], 1, frame.f->linesize[i],
                                      avctx->width, avctx->height,
                                      c->slices, 0);
                    restore_median_planar_il(c, frame.f->data[i], frame.f->linesize[i],
                                             avctx->width, avctx->height,
                                             c->slices, 0);
                }
            }
        }
@@ -724,6 +827,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
    c->avctx = avctx;

    ff_bswapdsp_init(&c->bdsp);
    ff_huffyuvdsp_init(&c->hdspdec);

    if (avctx->extradata_size >= 16) {
        av_log(avctx, AV_LOG_DEBUG, "Encoder version %d.%d.%d.%d\n",
--- a/libavcodec/utvideoenc.c
+++ b/libavcodec/utvideoenc.c
@@ -293,6 +293,9 @@ static void left_predict(uint8_t *src, uint8_t *dst, int stride,
    }
 }

 #undef A
 #undef B

 /* Write data to a plane with median prediction */
 static void median_predict(UtvideoContext *c, uint8_t *src, uint8_t *dst, int stride,
                           int width, int height)