avcodec/hevc: new idct + asm

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
11 years ago · 92cccb7bcd
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@ -1388,8 +1388,21 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
            s->hevcdsp.transform_skip(dst, coeffs, stride);
        else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2)
            s->hevcdsp.transform_4x4_luma_add(dst, coeffs, stride);
        else
            s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
        else {
            int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
            if (max_xy == 0)
                s->hevcdsp.transform_dc_add[log2_trafo_size-2](dst, coeffs, stride);
            else {
                int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
                if (max_xy < 4)
                    col_limit = FFMIN(4, col_limit);
                else if (max_xy < 8)
                    col_limit = FFMIN(8, col_limit);
                else if (max_xy < 12)
                    col_limit = FFMIN(24, col_limit);
                s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride, col_limit);
            }
        }
    }
 }

--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -202,6 +202,11 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
    hevcdsp->transform_add[2]       = FUNC(transform_16x16_add, depth);     \
    hevcdsp->transform_add[3]       = FUNC(transform_32x32_add, depth);     \
                                                                            \
    hevcdsp->transform_dc_add[0]    = FUNC(transform_4x4_dc_add, depth);    \
    hevcdsp->transform_dc_add[1]    = FUNC(transform_8x8_dc_add, depth);    \
    hevcdsp->transform_dc_add[2]    = FUNC(transform_16x16_dc_add, depth);  \
    hevcdsp->transform_dc_add[3]    = FUNC(transform_32x32_dc_add, depth);  \
                                                                            \
    hevcdsp->sao_band_filter[0] = FUNC(sao_band_filter_0, depth);           \
    hevcdsp->sao_band_filter[1] = FUNC(sao_band_filter_1, depth);           \
    hevcdsp->sao_band_filter[2] = FUNC(sao_band_filter_2, depth);           \
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -50,7 +50,9 @@ typedef struct HEVCDSPContext {
    void (*transform_skip)(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
    void (*transform_4x4_luma_add)(uint8_t *dst, int16_t *coeffs,
                                   ptrdiff_t stride);
    void (*transform_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
    void (*transform_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t _stride, int col_limit);

    void (*transform_dc_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);

    void (*sao_band_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                               struct SAOParams *sao, int *borders,
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -178,172 +178,122 @@ static void FUNC(transform_4x4_luma_add)(uint8_t *_dst, int16_t *coeffs,

 #undef TR_4x4_LUMA

 #define TR_4(dst, src, dstep, sstep, assign)                            \
    do {                                                                \
        const int e0 = transform[8 * 0][0] * src[0 * sstep] +           \
                       transform[8 * 2][0] * src[2 * sstep];            \
        const int e1 = transform[8 * 0][1] * src[0 * sstep] +           \
                       transform[8 * 2][1] * src[2 * sstep];            \
        const int o0 = transform[8 * 1][0] * src[1 * sstep] +           \
                       transform[8 * 3][0] * src[3 * sstep];            \
        const int o1 = transform[8 * 1][1] * src[1 * sstep] +           \
                       transform[8 * 3][1] * src[3 * sstep];            \
                                                                        \
        assign(dst[0 * dstep], e0 + o0);                                \
        assign(dst[1 * dstep], e1 + o1);                                \
        assign(dst[2 * dstep], e1 - o1);                                \
        assign(dst[3 * dstep], e0 - o0);                                \
 /*
  * One 4-point pass of the transform.
  *
  * Reads src[0], src[1 * sstep], src[2 * sstep] and src[3 * sstep], combines
  * them with the constants 64, 83 and 36 into an even pair (e0, e1) and an
  * odd pair (o0, o1), and hands the four results to `assign` for
  * dst[0 .. 3 * dstep]:
  *   dst[0] <- e0 + o0,  dst[1] <- e1 + o1,
  *   dst[2] <- e1 - o1,  dst[3] <- e0 - o0.
  *
  * `assign` is invoked as assign(lvalue, value); it is supplied by the caller
  * (SCALE, ADD_AND_SCALE and SET are used at the call sites).
  * `end` is accepted but never referenced in this macro.
  */
 #define TR_4(dst, src, dstep, sstep, assign, end)                              \
    do {                                                                       \
        const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep];              \
        const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep];              \
        const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep];              \
        const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep];              \
                                                                               \
        assign(dst[0 * dstep], e0 + o0);                                       \
        assign(dst[1 * dstep], e1 + o1);                                       \
        assign(dst[2 * dstep], e1 - o1);                                       \
        assign(dst[3 * dstep], e0 - o0);                                       \
    } while (0)

 /*
  * Two-pass 4x4 transform of `coeffs`, with the second pass writing to `_dst`.
  *
  * @param _dst    destination, reinterpreted as `pixel *`
  * @param coeffs  16 coefficients; overwritten in place by the first pass
  * @param stride  destination stride in bytes (divided by sizeof(pixel) below)
  *
  * NOTE(review): TR_4 is invoked here with five arguments, i.e. this function
  * matches the five-parameter form of TR_4, not the six-parameter form that
  * takes an extra `end` argument.
  * NOTE(review): `shift` and `add` are not read directly in this function;
  * presumably the SCALE and ADD_AND_SCALE macros (defined elsewhere) use
  * them - confirm.
  */
 static void FUNC(transform_4x4_add)(uint8_t *_dst, int16_t *coeffs,
                                     ptrdiff_t stride)
 {
    int i;
    pixel *dst   = (pixel *)_dst;
    int shift    = 7;
    int add      = 1 << (shift - 1);
    int16_t *src = coeffs;

    stride /= sizeof(pixel);

    /* First pass: src advances by one element per iteration and TR_4 steps by
     * 4, so each iteration transforms one column of the 4x4 block in place. */
    for (i = 0; i < 4; i++) {
        TR_4(src, src, 4, 4, SCALE);
        src++;
    }

    /* Second pass: one row of coefficients per iteration, written to one
     * destination row, with a bit-depth dependent shift. */
    shift = 20 - BIT_DEPTH;
    add   = 1 << (shift - 1);
    for (i = 0; i < 4; i++) {
        TR_4(dst, coeffs, 1, 1, ADD_AND_SCALE);
        coeffs += 4;
        dst    += stride;
    }
 }

 #define TR_8(dst, src, dstep, sstep, assign)                      \
    do {                                                          \
        int i, j;                                                 \
        int e_8[4];                                               \
        int o_8[4] = { 0 };                                       \
        for (i = 0; i < 4; i++)                                   \
            for (j = 1; j < 8; j += 2)                            \
                o_8[i] += transform[4 * j][i] * src[j * sstep];   \
        TR_4(e_8, src, 1, 2 * sstep, SET);                        \
                                                                  \
        for (i = 0; i < 4; i++) {                                 \
            assign(dst[i * dstep], e_8[i] + o_8[i]);              \
            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);        \
        }                                                         \
 /*
  * One 8-point pass of the transform.
  *
  * Odd part: o_8[i] accumulates transform[4 * j][i] * src[j * sstep] over the
  * odd indices j = 1, 3, ... that are below `end`, so a smaller `end` skips
  * the higher odd inputs entirely.
  * Even part: e_8[] is produced by TR_4 reading every second input
  * (2 * sstep) and storing through SET; TR_4 is always given 4 as its `end`.
  * Output: dst[i] <- e_8[i] + o_8[i] and dst[7 - i] <- e_8[i] - o_8[i],
  * both routed through `assign`.
  *
  * The macro declares its own `i` and `j`, which shadow any variables of the
  * same name at the expansion site.
  */
 #define TR_8(dst, src, dstep, sstep, assign, end)                              \
    do {                                                                       \
        int i, j;                                                              \
        int e_8[4];                                                            \
        int o_8[4] = { 0 };                                                    \
        for (i = 0; i < 4; i++)                                                \
            for (j = 1; j < end; j += 2)                                       \
                o_8[i] += transform[4 * j][i] * src[j * sstep];                \
        TR_4(e_8, src, 1, 2 * sstep, SET, 4);                                  \
                                                                               \
        for (i = 0; i < 4; i++) {                                              \
            assign(dst[i * dstep], e_8[i] + o_8[i]);                           \
            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);                     \
        }                                                                      \
    } while (0)

 #define TR_16(dst, src, dstep, sstep, assign)                     \
    do {                                                          \
        int i, j;                                                 \
        int e_16[8];                                              \
        int o_16[8] = { 0 };                                      \
        for (i = 0; i < 8; i++)                                   \
            for (j = 1; j < 16; j += 2)                           \
                o_16[i] += transform[2 * j][i] * src[j * sstep];  \
        TR_8(e_16, src, 1, 2 * sstep, SET);                       \
                                                                  \
        for (i = 0; i < 8; i++) {                                 \
            assign(dst[i * dstep], e_16[i] + o_16[i]);            \
            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);     \
        }                                                         \
 /*
  * One 16-point pass of the transform.
  *
  * Odd part: o_16[i] accumulates transform[2 * j][i] * src[j * sstep] over
  * the odd indices j below `end`.
  * Even part: e_16[] comes from TR_8 on every second input (2 * sstep),
  * stored through SET. TR_8 is always given the constant 8 as its `end`,
  * independent of this macro's own `end`.
  * Output: dst[i] <- e_16[i] + o_16[i] and dst[15 - i] <- e_16[i] - o_16[i],
  * both routed through `assign`.
  *
  * The macro declares its own `i` and `j`, which shadow any variables of the
  * same name at the expansion site.
  */
 #define TR_16(dst, src, dstep, sstep, assign, end)                             \
    do {                                                                       \
        int i, j;                                                              \
        int e_16[8];                                                           \
        int o_16[8] = { 0 };                                                   \
        for (i = 0; i < 8; i++)                                                \
            for (j = 1; j < end; j += 2)                                       \
                o_16[i] += transform[2 * j][i] * src[j * sstep];               \
        TR_8(e_16, src, 1, 2 * sstep, SET, 8);                                 \
                                                                               \
        for (i = 0; i < 8; i++) {                                              \
            assign(dst[i * dstep], e_16[i] + o_16[i]);                         \
            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);                  \
        }                                                                      \
    } while (0)

 #define TR_32(dst, src, dstep, sstep, assign)                     \
    do {                                                          \
        int i, j;                                                 \
        int e_32[16];                                             \
        int o_32[16] = { 0 };                                     \
        for (i = 0; i < 16; i++)                                  \
            for (j = 1; j < 32; j += 2)                           \
                o_32[i] += transform[j][i] * src[j * sstep];      \
        TR_16(e_32, src, 1, 2 * sstep, SET);                      \
                                                                  \
        for (i = 0; i < 16; i++) {                                \
            assign(dst[i * dstep], e_32[i] + o_32[i]);            \
            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);     \
        }                                                         \
 /*
  * One 32-point pass of the transform.
  *
  * Odd part: o_32[i] accumulates transform[j][i] * src[j * sstep] over the
  * odd indices j below `end`.
  * Even part: e_32[] comes from TR_16 on every second input (2 * sstep),
  * stored through SET; unlike TR_16 and TR_8, the bound is forwarded, as
  * end/2.
  * Output: dst[i] <- e_32[i] + o_32[i] and dst[31 - i] <- e_32[i] - o_32[i],
  * both routed through `assign`.
  *
  * The macro declares its own `i` and `j`, which shadow any variables of the
  * same name at the expansion site.
  */
 #define TR_32(dst, src, dstep, sstep, assign, end)                             \
    do {                                                                       \
        int i, j;                                                              \
        int e_32[16];                                                          \
        int o_32[16] = { 0 };                                                  \
        for (i = 0; i < 16; i++)                                               \
            for (j = 1; j < end; j += 2)                                       \
                o_32[i] += transform[j][i] * src[j * sstep];                   \
        TR_16(e_32, src, 1, 2 * sstep, SET, end/2);                            \
                                                                               \
        for (i = 0; i < 16; i++) {                                             \
            assign(dst[i * dstep], e_32[i] + o_32[i]);                         \
            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);                  \
        }                                                                      \
    } while (0)



 static void FUNC(transform_8x8_add)(uint8_t *_dst, int16_t *coeffs,
                                    ptrdiff_t stride)
 {
    int i;
    pixel *dst   = (pixel *)_dst;
    int shift    = 7;
    int add      = 1 << (shift - 1);
    int16_t *src = coeffs;

    stride /= sizeof(pixel);

    for (i = 0; i < 8; i++) {
        TR_8(src, src, 8, 8, SCALE);
        src++;
    }

    shift = 20 - BIT_DEPTH;
    add   = 1 << (shift - 1);
    for (i = 0; i < 8; i++) {
        TR_8(dst, coeffs, 1, 1, ADD_AND_SCALE);
        coeffs += 8;
        dst    += stride;
    }
 /*
  * Generates FUNC(transform_HxH_add)(_dst, coeffs, _stride, col_limit):
  * a two-pass HxH transform built on TR_H.
  *
  *   _dst       destination, reinterpreted as `pixel *`
  *   coeffs     H*H coefficients; overwritten in place by the first pass
  *   _stride    destination stride in bytes (converted to pixels below)
  *   col_limit  bound passed down as the `end` argument of TR_H, which caps
  *              the odd-index accumulation loops of TR_8/TR_16/TR_32
  *
  * First pass: iteration i transforms the column starting at coeffs[i]
  * (element step H) in place. Its bound starts at min(col_limit + 4, H) and,
  * while still below H, is lowered by 4 after every iteration whose index is
  * a non-zero multiple of 4.
  * Second pass: iteration i transforms row i of coeffs into row i of dst,
  * using the fixed bound min(col_limit, H).
  *
  * NOTE(review): `shift` and `add` are set before each pass but not read
  * here; presumably SCALE and ADD_AND_SCALE (defined elsewhere) use them -
  * confirm.
  */
 #define TRANSFORM_ADD(H)                                                       \
 static void FUNC(transform_##H ##x ##H ##_add)(                                \
    uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride, int col_limit) {        \
    int i;                                                                     \
    pixel    *dst    = (pixel *)_dst;                                          \
    int      stride  = _stride/sizeof(pixel);                                  \
    int      shift   = 7;                                                      \
    int      add     = 1 << (shift - 1);                                       \
    int16_t *src     = coeffs;                                                 \
    int      limit   = FFMIN(col_limit + 4, H);                                \
                                                                               \
    for (i = 0; i < H; i++) {                                                  \
        TR_ ## H(src, src, H, H, SCALE, limit);                                \
        if (limit < H && i%4 == 0 && !!i)                                      \
            limit -= 4;                                                        \
        src++;                                                                 \
    }                                                                          \
    limit   = FFMIN(col_limit, H);                                             \
                                                                               \
    shift   = 20 - BIT_DEPTH;                                                  \
    add     = 1 << (shift - 1);                                                \
    for (i = 0; i < H; i++) {                                                  \
        TR_ ## H(dst, coeffs, 1, 1, ADD_AND_SCALE, limit);                     \
        coeffs += H;                                                           \
        dst    += stride;                                                      \
    }                                                                          \
 }

 static void FUNC(transform_16x16_add)(uint8_t *_dst, int16_t *coeffs,
                                      ptrdiff_t stride)
 {
    int i;
    pixel *dst   = (pixel *)_dst;
    int shift    = 7;
    int add      = 1 << (shift - 1);
    int16_t *src = coeffs;

    stride /= sizeof(pixel);

    for (i = 0; i < 16; i++) {
        TR_16(src, src, 16, 16, SCALE);
        src++;
    }

    shift = 20 - BIT_DEPTH;
    add   = 1 << (shift - 1);
    for (i = 0; i < 16; i++) {
        TR_16(dst, coeffs, 1, 1, ADD_AND_SCALE);
        coeffs += 16;
        dst    += stride;
    }
 /*
  * Generates FUNC(transform_HxH_dc_add)(_dst, coeffs, _stride): the DC-only
  * variant of the HxH transform-and-add.
  *
  * Only coeffs[0] is read. It is reduced to a single offset
  *   dc = (((coeffs[0] + 1) >> 1) + add) >> shift,  shift = 14 - BIT_DEPTH,
  * and that offset is added to every sample of the HxH destination block,
  * each result being passed through av_clip_pixel().
  *
  *   _dst     destination, reinterpreted as `pixel *`
  *   coeffs   coefficient buffer; not modified
  *   _stride  destination stride in bytes (converted to pixels below)
  */
 #define TRANSFORM_DC_ADD(H)                                                    \
 static void FUNC(transform_##H ##x ##H ##_dc_add)(                             \
    uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride) {                       \
    int x, y;                                                                  \
    pixel    *row    = (pixel *)_dst;                                          \
    int      stride  = _stride/sizeof(pixel);                                  \
    int      shift   = 14 - BIT_DEPTH;                                         \
    int      add     = 1 << (shift - 1);                                       \
    int      dc      = (((coeffs[0] + 1) >> 1) + add) >> shift;                \
                                                                               \
    for (y = 0; y < H; y++) {                                                  \
        for (x = 0; x < H; x++) {                                              \
            row[x] = av_clip_pixel(row[x] + dc);                               \
        }                                                                      \
        row += stride;                                                         \
    }                                                                          \
 }

 static void FUNC(transform_32x32_add)(uint8_t *_dst, int16_t *coeffs,
                                      ptrdiff_t stride)
 {
    int i;
    pixel *dst   = (pixel *)_dst;
    int shift    = 7;
    int add      = 1 << (shift - 1);
    int16_t *src = coeffs;
 TRANSFORM_ADD( 4)
 TRANSFORM_ADD( 8)
 TRANSFORM_ADD(16)
 TRANSFORM_ADD(32)

    stride /= sizeof(pixel);
 TRANSFORM_DC_ADD( 4)
 TRANSFORM_DC_ADD( 8)
 TRANSFORM_DC_ADD(16)
 TRANSFORM_DC_ADD(32)

    for (i = 0; i < 32; i++) {
        TR_32(src, src, 32, 32, SCALE);
        src++;
    }
    src   = coeffs;
    shift = 20 - BIT_DEPTH;
    add   = 1 << (shift - 1);
    for (i = 0; i < 32; i++) {
        TR_32(dst, coeffs, 1, 1, ADD_AND_SCALE);
        coeffs += 32;
        dst    += stride;
    }
 }

 static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
                                  ptrdiff_t stride, SAOParams *sao,
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -92,7 +92,8 @@ YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_8bit.o          \
                                          x86/fpel.o                    \
                                          x86/qpel.o
 YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_mc.o                 \
                                          x86/hevc_deblock.o
                                          x86/hevc_deblock.o            \
                                          x86/hevc_idct.o
 YASM-OBJS-$(CONFIG_HPELDSP)            += x86/fpel.o                    \
                                          x86/hpeldsp.o
 YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o
--- a/libavcodec/x86/hevc_idct.asm
+++ b/libavcodec/x86/hevc_idct.asm
@@ -0,0 +1,180 @@
 ; /*
 ; * Provide SSE & MMX idct functions for HEVC decoding
 ; * Copyright (c) 2014 Pierre-Edouard LEPERE
 ; *
 ; * This file is part of FFmpeg.
 ; *
 ; * FFmpeg is free software; you can redistribute it and/or
 ; * modify it under the terms of the GNU Lesser General Public
 ; * License as published by the Free Software Foundation; either
 ; * version 2.1 of the License, or (at your option) any later version.
 ; *
 ; * FFmpeg is distributed in the hope that it will be useful,
 ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
 ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ; * Lesser General Public License for more details.
 ; *
 ; * You should have received a copy of the GNU Lesser General Public
 ; * License along with FFmpeg; if not, write to the Free Software
 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ; */
 %include "libavutil/x86/x86util.asm"

 ; Read-only constants for the 10-bit routines.
 ;   max_pixels_10: eight words of (1 << 10) - 1, loaded into m6 as the upper
 ;                  clamp bound by the 10-bit functions below.
 ;   dc_add_10:     four dwords of (1 << 14-10) + 1.
 ;                  NOTE(review): not referenced in the visible part of this
 ;                  file; the 10-bit functions use an immediate instead.
 SECTION_RODATA
 max_pixels_10:          times 8  dw ((1 << 10)-1)
 dc_add_10:              times 4 dd ((1 << 14-10) + 1)


 SECTION .text

 ;the idct_dc_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file

 ; DC_ADD_INIT dc, stride
 ;   %1: general-purpose register holding the DC coefficient on entry; on exit
 ;       it holds 3 * stride (row offset for the fourth row of a 4-row group).
 ;   %2: register holding the stride.
 ; Results:
 ;   m0: the rounded DC offset, saturated to an unsigned byte and replicated
 ;       (non-zero only when the offset is positive)
 ;   m1: the negated offset, treated the same way
 ;       (non-zero only when the offset is negative)
 ; so a signed offset can later be applied with one saturating byte add and
 ; one saturating byte subtract.
 %macro DC_ADD_INIT 2
    ; 16-bit rounding and scaling; with '-' binding tighter than '<<' this is
    ; (dc + 65) >> 7
    add              %1w, ((1 << 14-8) + 1)
    sar              %1w, (15-8)
    movd              m0, %1
    ; the coefficient now lives in m0, so %1 is reused for 3 * stride
    lea               %1, [%2*3]
    ; SPLATW comes from x86util.asm; presumably broadcasts word 0 of m0
    SPLATW            m0, m0, 0
    ; m1 = 0 - m0 (per word)
    pxor              m1, m1
    psubw             m1, m0
    ; words -> bytes with unsigned saturation: negative words become 0
    packuswb          m0, m0
    packuswb          m1, m1
 %endmacro

 ; DC_ADD_OP mov, dst, stride, stride3
 ;   %1: move instruction used for both the loads and the stores
 ;       (the callers pass movh or mova, which selects the row width)
 ;   %2: pointer to the first of four rows
 ;   %3: stride
 ;   %4: 3 * stride
 ; Expects m0/m1 as prepared by DC_ADD_INIT. Loads four rows into m2..m5,
 ; adds m0 and then subtracts m1 with unsigned byte saturation, and stores
 ; the rows back to the same addresses.
 %macro DC_ADD_OP 4
    %1                m2, [%2     ]
    %1                m3, [%2+%3  ]
    %1                m4, [%2+%3*2]
    %1                m5, [%2+%4  ]
    paddusb           m2, m0
    paddusb           m3, m0
    paddusb           m4, m0
    paddusb           m5, m0
    psubusb           m2, m1
    psubusb           m3, m1
    psubusb           m4, m1
    psubusb           m5, m1
    %1         [%2     ], m2
    %1         [%2+%3  ], m3
    %1         [%2+%3*2], m4
    %1         [%2+%4  ], m5
 %endmacro

 ; 8-bit DC-only add for 4x4 and 8x8 blocks, MMXEXT versions.
 ; Two register layouts are provided:
 ;   x86-64: all three arguments are taken in registers (r0 = dst,
 ;           r1 = coeffs, r2 = stride) and r3 is the scratch register that
 ;           first holds the DC coefficient and then 3 * stride.
 ;   x86-32: only two arguments are loaded (r0 = dst, r1 = coeffs); r2 takes
 ;           the scratch role and, once coeffs[0] has been read, r1 is
 ;           reloaded with the stride from its argument slot (r2m).
 INIT_MMX mmxext
 ; void ff_hevc_idct4_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 %if ARCH_X86_64
 cglobal hevc_idct4_dc_add_8, 3, 4, 0
    ; sign-extended coeffs[0]
    movsx             r3, word [r1]
    DC_ADD_INIT       r3, r2
    ; one group of four rows, moved with movh
    DC_ADD_OP       movh, r0, r2, r3
    RET

 ; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 cglobal hevc_idct8_dc_add_8, 3, 4, 0
    movsx             r3, word [r1]
    DC_ADD_INIT       r3, r2
    ; rows 0-3, then advance dst by four rows for rows 4-7
    DC_ADD_OP       mova, r0, r2, r3
    lea               r0, [r0+r2*4]
    DC_ADD_OP       mova, r0, r2, r3
    RET
 %else
 ; void ff_hevc_idct4_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 cglobal hevc_idct4_dc_add_8, 2, 3, 0
    movsx             r2, word [r1]
    ; coeffs pointer no longer needed: r1 becomes the stride
    mov               r1, r2m
    DC_ADD_INIT       r2, r1
    DC_ADD_OP       movh, r0, r1, r2
    RET

 ; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 cglobal hevc_idct8_dc_add_8, 2, 3, 0
    movsx             r2, word [r1]
    mov               r1, r2m
    DC_ADD_INIT       r2, r1
    DC_ADD_OP       mova, r0, r1, r2
    lea               r0, [r0+r1*4]
    DC_ADD_OP       mova, r0, r1, r2
    RET
 %endif


 INIT_XMM sse2
 ; void ff_hevc_idct16_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 ; 8-bit DC-only add for a 16x16 block: r0 = dst, r1 = coeffs, r2 = stride,
 ; r3 = scratch (DC coefficient, then 3 * stride). The sixteen rows are
 ; processed as four groups of four, advancing dst by 4 * stride in between.
 ; Rows are accessed with mova (the aligned move), so each destination row is
 ; presumably expected to be suitably aligned - confirm against the callers.
 cglobal hevc_idct16_dc_add_8, 3, 4, 0
    movsx             r3, word [r1]
    DC_ADD_INIT       r3, r2
    DC_ADD_OP       mova, r0, r2, r3
    lea               r0, [r0+r2*4]
    DC_ADD_OP       mova, r0, r2, r3
    lea               r0, [r0+r2*4]
    DC_ADD_OP       mova, r0, r2, r3
    lea               r0, [r0+r2*4]
    DC_ADD_OP       mova, r0, r2, r3
    RET

 ;-----------------------------------------------------------------------------
 ; void ff_hevc_idct4_dc_add_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 ;-----------------------------------------------------------------------------
 ; IDCT_DC_ADD_OP_10 dst, stride, stride3
 ;   %1: pointer to the first of four rows
 ;   %2: stride
 ;   %3: 3 * stride
 ; Expects m0 = DC offset replicated into words and m6 = upper clamp bound.
 ; Adds m0 to four rows of 16-bit samples (word-wise paddw), clamps each
 ; result between m5 (zeroed here) and m6 with CLIPW, and stores the rows
 ; back. Clobbers m1..m5.
 ; NOTE(review): CLIPW comes from x86util.asm; it is assumed to clamp words
 ; to the [min, max] pair given as its 2nd and 3rd operands - confirm.
 %macro IDCT_DC_ADD_OP_10 3
    pxor              m5, m5
 %if avx_enabled
    ; three-operand form: load and add in one instruction
    paddw             m1, m0, [%1+0   ]
    paddw             m2, m0, [%1+%2  ]
    paddw             m3, m0, [%1+%2*2]
    paddw             m4, m0, [%1+%3  ]
 %else
    mova              m1, [%1+0   ]
    mova              m2, [%1+%2  ]
    mova              m3, [%1+%2*2]
    mova              m4, [%1+%3  ]
    paddw             m1, m0
    paddw             m2, m0
    paddw             m3, m0
    paddw             m4, m0
 %endif
    CLIPW             m1, m5, m6
    CLIPW             m2, m5, m6
    CLIPW             m3, m5, m6
    CLIPW             m4, m5, m6
    mova       [%1+0   ], m1
    mova       [%1+%2  ], m2
    mova       [%1+%2*2], m3
    mova       [%1+%3  ], m4
 %endmacro

 ; 10-bit DC-only add for a 4x4 block, MMXEXT version.
 ; r0 = dst, r1 = coeffs, r2 = stride. r1 is recycled twice: first its low
 ; 16 bits receive coeffs[0] (the pointer is lost), later it holds 3 * stride.
 INIT_MMX mmxext
 cglobal hevc_idct4_dc_add_10,3,3
    mov              r1w, [r1]
    ; 16-bit rounding and scaling: (dc + 17) >> 5
    add              r1w, ((1 << 4) + 1)
    sar              r1w, 5
    movd              m0, r1d
    lea               r1, [r2*3]
    ; SPLATW comes from x86util.asm; presumably broadcasts word 0 of m0
    SPLATW            m0, m0, 0
    ; upper clamp bound for IDCT_DC_ADD_OP_10
    mova              m6, [max_pixels_10]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET

 ;-----------------------------------------------------------------------------
 ; void ff_hevc_idct8_dc_add_10_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 ; <opt> is sse2 or avx, depending on the INIT_XMM in effect when the
 ; IDCT8_DC_ADD macro is expanded (see the instantiations after the macro).
 ;-----------------------------------------------------------------------------
 ; 10-bit DC-only add for an 8x8 block. r0 = dst, r1 = coeffs, r2 = stride.
 ; r1 is recycled: its low 16 bits receive coeffs[0], later it holds
 ; 3 * stride. The eight rows are processed as two groups of four.
 %macro IDCT8_DC_ADD 0
 cglobal hevc_idct8_dc_add_10,3,4,7
    mov              r1w, [r1]
    ; 16-bit rounding and scaling: (dc + 17) >> 5
    add              r1w, ((1 << 4) + 1)
    sar              r1w, 5
    movd              m0, r1d
    lea               r1, [r2*3]
    SPLATW            m0, m0, 0
    ; upper clamp bound for IDCT_DC_ADD_OP_10
    mova              m6, [max_pixels_10]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea               r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
 %endmacro

 ; Emit the SSE2 version always, and the AVX version when the assembler
 ; supports it.
 INIT_XMM sse2
 IDCT8_DC_ADD
 %if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT8_DC_ADD
 %endif
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -28,6 +28,10 @@
 #include <stddef.h>
 #include <stdint.h>


 /*
  * Declares the prototype of one DC-only transform-and-add routine:
  *   void ff_hevc_idct<size>_dc_add_<bitd>_<opt>(uint8_t *dst,
  *                                               int16_t *coeffs,
  *                                               ptrdiff_t stride);
  * The expansion has no trailing semicolon; the user supplies it.
  */
 #define idct_dc_proto(size, bitd, opt) \
                void ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)

 #define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
 dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
 dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
@@ -119,5 +123,26 @@ QPEL_PROTOTYPES(qpel_hv, 10, sse4);
 WEIGHTING_PROTOTYPES(8, sse4);
 WEIGHTING_PROTOTYPES(10, sse4);

 ///////////////////////////////////////////////////////////////////////////////
 // IDCT
 ///////////////////////////////////////////////////////////////////////////////


 /* bitd = 8: block sizes 4 and 8 as mmxext, 16 and 32 as sse2 */
 idct_dc_proto(4, 8,mmxext);
 idct_dc_proto(8, 8,mmxext);
 idct_dc_proto(16,8,  sse2);
 idct_dc_proto(32,8,  sse2);


 /* bitd = 10: block size 4 as mmxext; 8, 16 and 32 as both sse2 and avx */
 idct_dc_proto(4, 10,mmxext);
 idct_dc_proto(8, 10,  sse2);
 idct_dc_proto(16,10,  sse2);
 idct_dc_proto(32,10,  sse2);
 idct_dc_proto(8, 10,   avx);
 idct_dc_proto(16,10,   avx);
 idct_dc_proto(32,10,   avx);




 #endif // AVCODEC_X86_HEVCDSP_H
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -49,6 +49,48 @@ LFC_FUNCS(uint8_t,  10)
 LFL_FUNCS(uint8_t,   8)
 LFL_FUNCS(uint8_t,  10)

 #if HAVE_SSE2_EXTERNAL
 /*
  * 32x32 DC-only add (the "_8" variant), composed from four calls to the
  * 16x16 SSE2 routine: top-left, top-right, bottom-left, bottom-right.
  * The same coeffs pointer and stride are forwarded to every call.
  * Offsets are applied to a uint8_t pointer, i.e. they are in bytes:
  * 16 bytes to the right, 16 * stride bytes down.
  */
 void ff_hevc_idct32_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 {
    uint8_t *const top    = dst;
    uint8_t *const bottom = dst + 16 * stride;

    ff_hevc_idct16_dc_add_8_sse2(top,         coeffs, stride);
    ff_hevc_idct16_dc_add_8_sse2(top + 16,    coeffs, stride);
    ff_hevc_idct16_dc_add_8_sse2(bottom,      coeffs, stride);
    ff_hevc_idct16_dc_add_8_sse2(bottom + 16, coeffs, stride);
 }

 /*
  * 16x16 DC-only add (the "_10" variant), composed from four calls to the
  * 8x8 SSE2 routine in the order top-left, top-right, bottom-left,
  * bottom-right. Offsets are in bytes (dst is uint8_t *): 16 bytes to the
  * right - presumably 8 samples of two bytes each - and 8 * stride down.
  */
 void ff_hevc_idct16_dc_add_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 {
    int quad;

    /* bit 0 of quad selects the right half, bit 1 the bottom half */
    for (quad = 0; quad < 4; quad++) {
        uint8_t *blk = dst;

        if (quad & 2) {
            blk += 8 * stride;
        }
        if (quad & 1) {
            blk += 16;
        }
        ff_hevc_idct8_dc_add_10_sse2(blk, coeffs, stride);
    }
 }

 /*
  * 32x32 DC-only add (the "_10" variant), composed from four calls to the
  * 16x16 SSE2 wrapper, visiting the quadrants row by row (top-left,
  * top-right, bottom-left, bottom-right). Offsets are in bytes (dst is
  * uint8_t *): 32 bytes to the right, 16 * stride down.
  */
 void ff_hevc_idct32_dc_add_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 {
    int qy, qx;

    for (qy = 0; qy < 2; qy++) {
        uint8_t *band = dst + qy * 16 * stride;

        for (qx = 0; qx < 2; qx++) {
            ff_hevc_idct16_dc_add_10_sse2(band + qx * 32, coeffs, stride);
        }
    }
 }
 #endif //HAVE_SSE2_EXTERNAL
 #if HAVE_AVX_EXTERNAL
 /*
  * 16x16 DC-only add (the "_10" variant), composed from four calls to the
  * 8x8 AVX routine in the order top-left, top-right, bottom-left,
  * bottom-right. Offsets are in bytes (dst is uint8_t *): 16 bytes to the
  * right and 8 * stride down.
  */
 void ff_hevc_idct16_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 {
    uint8_t *const left_top    = dst;
    uint8_t *const left_bottom = dst + 8 * stride;

    ff_hevc_idct8_dc_add_10_avx(left_top,         coeffs, stride);
    ff_hevc_idct8_dc_add_10_avx(left_top + 16,    coeffs, stride);
    ff_hevc_idct8_dc_add_10_avx(left_bottom,      coeffs, stride);
    ff_hevc_idct8_dc_add_10_avx(left_bottom + 16, coeffs, stride);
 }

 /*
  * 32x32 DC-only add (the "_10" variant), composed from four calls to the
  * 16x16 AVX wrapper in the order top-left, top-right, bottom-left,
  * bottom-right. Offsets are in bytes (dst is uint8_t *): 32 bytes to the
  * right and 16 * stride down.
  */
 void ff_hevc_idct32_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 {
    int quad;

    /* bit 0 of quad selects the right half, bit 1 the bottom half */
    for (quad = 0; quad < 4; quad++) {
        uint8_t *blk = dst;

        if (quad & 2) {
            blk += 16 * stride;
        }
        if (quad & 1) {
            blk += 32;
        }
        ff_hevc_idct16_dc_add_10_avx(blk, coeffs, stride);
    }
 }
 #endif //HAVE_AVX_EXTERNAL

 #define mc_rep_func(name, bitd, step, W, opt) \
 void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, ptrdiff_t dststride,                            \
@@ -368,9 +410,17 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
    int mm_flags = av_get_cpu_flags();

    if (bit_depth == 8) {
        if (EXTERNAL_MMXEXT(mm_flags)) {
                c->transform_dc_add[0]    =  ff_hevc_idct4_dc_add_8_mmxext;
                c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_8_mmxext;

        }
        if (EXTERNAL_SSE2(mm_flags)) {
                    c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
                    c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;

                    c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_8_sse2;
                    c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_8_sse2;
        }
        if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
                    c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
@@ -387,13 +437,21 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);

        }
    } else if (bit_depth == 10) {
        if (EXTERNAL_MMXEXT(mm_flags)) {
                c->transform_dc_add[0]    =  ff_hevc_idct4_dc_add_10_mmxext;

        }
        if (EXTERNAL_SSE2(mm_flags)) {
                    c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
                    c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
        }


                    c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_10_sse2;
                    c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_sse2;
                    c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_sse2;
                }
        if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
                    c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
                    c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
@@ -410,5 +468,11 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
        }
        if (EXTERNAL_AVX(mm_flags)) {
            c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_10_avx;
            c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_avx;
            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_avx;
        }

    }
 }