@@ -38,9 +38,9 @@
 #include "golomb.h"
 #include "hevc.h"
 
-const uint8_t ff_hevc_qpel_extra_before[4] = { 0, 3, 3, 2 };
-const uint8_t ff_hevc_qpel_extra_after[4]  = { 0, 3, 4, 4 };
-const uint8_t ff_hevc_qpel_extra[4]        = { 0, 6, 7, 6 };
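+/* the SIMD code always runs the full 8-tap filter, so every nonzero
+ * fractional position needs the same 3 extra rows/columns before the
+ * block and 4 after it (hence, it seems, the now-uniform values) */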
+const uint8_t ff_hevc_qpel_extra_before[4] = { 0, 3, 3, 3 };
+const uint8_t ff_hevc_qpel_extra_after[4]  = { 0, 4, 4, 4 };
+const uint8_t ff_hevc_qpel_extra[4]        = { 0, 7, 7, 7 };
 
 static const uint8_t scan_1x1[1] = { 0 };
@@ -740,7 +740,7 @@ typedef struct HEVCPredContext {
 } HEVCPredContext;
 
 typedef struct HEVCLocalContext {
-    DECLARE_ALIGNED(16, int16_t, mc_buffer[(MAX_PB_SIZE + 7) * MAX_PB_SIZE]);
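+    /* enlarged: the x86 hv filters stage an int16 intermediate of
+     * FFALIGN(width + 7, 8) columns by (height + 7) rows here, which no
+     * longer fits in (MAX_PB_SIZE + 7) rows' worth of space */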
+    DECLARE_ALIGNED(16, int16_t, mc_buffer[(MAX_PB_SIZE + 24) * MAX_PB_SIZE]);
 
     uint8_t cabac_state[HEVC_CONTEXTS];
     uint8_t first_qp_group;
@@ -89,7 +89,7 @@ static const int8_t transform[32][32] = {
       90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 },
 };
 
-DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_filters[7][16]) = {
+DECLARE_ALIGNED(16, const int16_t, ff_hevc_epel_coeffs[7][16]) = {
     { -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2 },
     { -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2 },
     { -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4 },
@@ -99,6 +99,28 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_filters[7][16]) = {
     { -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2 },
 };
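+
+/* two coefficient layouts for the x86 code: the int16 tables
+ * (ff_hevc_epel_coeffs, ff_hevc_qpel_coeffs) are consumed with pmaddwd by
+ * the 10-bit functions, while the int8 *_coeffs8 tables hold the same taps
+ * as interleaved signed-byte pairs for pmaddubsw in the 8-bit functions */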
+DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_coeffs8[7][16]) = {
+    { -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2 },
+    { -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2 },
+    { -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4 },
+    { -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4 },
+    { -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6 },
+    { -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4 },
+    { -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2 },
+};
+
+DECLARE_ALIGNED(16, const int16_t, ff_hevc_qpel_coeffs[3][8]) = {
+    { -1, 4, -10, 58, 17,  -5, 1,  0 },
+    { -1, 4, -11, 40, 40, -11, 4, -1 },
+    {  0, 1,  -5, 17, 58, -10, 4, -1 },
+};
+
+DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_coeffs8[3][16]) = {
+    { -1, 4, -10, 58, 17,  -5, 1,  0, -1, 4, -10, 58, 17,  -5, 1,  0 },
+    { -1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11, 4, -1 },
+    {  0, 1,  -5, 17, 58, -10, 4, -1,  0, 1,  -5, 17, 58, -10, 4, -1 },
+};
 
 #define BIT_DEPTH 8
 #include "hevcdsp_template.c"
 #undef BIT_DEPTH
@@ -118,6 +118,9 @@ void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
 
-extern const int8_t ff_hevc_epel_filters[7][16];
+extern const int16_t ff_hevc_epel_coeffs[7][16];
+extern const int8_t  ff_hevc_epel_coeffs8[7][16];
+extern const int16_t ff_hevc_qpel_coeffs[3][8];
+extern const int8_t  ff_hevc_qpel_coeffs8[3][16];
 
 #endif /* AVCODEC_HEVCDSP_H */
@@ -1018,7 +1018,7 @@ static inline void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    const int16_t *filter = ff_hevc_epel_coeffs[mx - 1];
     int8_t filter_0 = filter[0];
     int8_t filter_1 = filter[1];
     int8_t filter_2 = filter[2];
@@ -1040,7 +1040,7 @@ static inline void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-    const int8_t *filter = ff_hevc_epel_filters[my - 1];
+    const int16_t *filter = ff_hevc_epel_coeffs[my - 1];
     int8_t filter_0 = filter[0];
     int8_t filter_1 = filter[1];
     int8_t filter_2 = filter[2];
@@ -1063,8 +1063,8 @@ static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-    const int8_t *filter_h = ff_hevc_epel_filters[mx - 1];
-    const int8_t *filter_v = ff_hevc_epel_filters[my - 1];
+    const int16_t *filter_h = ff_hevc_epel_coeffs[mx - 1];
+    const int16_t *filter_v = ff_hevc_epel_coeffs[my - 1];
     int8_t filter_0 = filter_h[0];
     int8_t filter_1 = filter_h[1];
     int8_t filter_2 = filter_h[2];
@@ -113,7 +113,8 @@ YASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
 YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
 YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
-YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_deblock.o
+YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_deblock.o   \
+                                          x86/hevc_mc.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv40dsp.o
@@ -0,0 +1,851 @@
+;*****************************************************************************
+;* x86-optimized HEVC MC
+;* Copyright 2015 Anton Khirnov
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .rodata
+
+pw_1023: times 8 dw 1023
+
+cextern hevc_qpel_coeffs
+cextern hevc_qpel_coeffs8
+cextern hevc_epel_coeffs
+cextern hevc_epel_coeffs8
+
+cextern pw_8
+cextern pw_16
+cextern pw_32
+cextern pw_64
+
+SECTION .text
+; %1: width
+; %2: bit depth
+%macro COMMON_DEFS 2
+    %assign blocksize 8
+    %assign nb_blocks ((%1 + blocksize - 1) / blocksize)
+    %define last_block_truncated (blocksize * nb_blocks > %1)
+%if %2 > 8
+    %define LOAD_BLOCK     movu
+    %define LOAD_HALFBLOCK movq
+    %assign pixelsize 2
+%else
+    %define LOAD_BLOCK     movq
+    %define LOAD_HALFBLOCK movd
+    %assign pixelsize 1
+%endif
+    %define STORE_BLOCK     mova
+    %define STORE_HALFBLOCK movq
+%endmacro
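+
+; Blocks are processed in 8-pixel units; e.g. width 12 gives nb_blocks == 2
+; with the last block truncated -- it carries only 4 valid pixels and is
+; handled with the half-block load/store.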
+; %1: block index
+%macro BLOCK_DEFS 1
+%if last_block_truncated && %1 == nb_blocks - 1
+    %define block_truncated 1
+    %define LOAD  LOAD_HALFBLOCK
+    %define STORE STORE_HALFBLOCK
+%else
+    %define block_truncated 0
+    %define LOAD  LOAD_BLOCK
+    %define STORE STORE_BLOCK
+%endif
+%endmacro
+; hevc_get_pixels_<w>_<d>(int16_t *dst, ptrdiff_t dststride,
+;                         pixel *src, ptrdiff_t srcstride,
+;                         int height, int mx, int my, int16_t *mcbuffer)
+; %1: block width
+; %2: bit depth
+; %3: log2 of height unroll
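+;
+; copies the block unfiltered, converting it to the int16 intermediate
+; format by shifting left by (14 - bit depth)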
+%macro GET_PIXELS 3
+cglobal hevc_get_pixels_ %+ %1 %+ _ %+ %2, 5, 5, 2, dst, dststride, src, srcstride, height ; rest of the args unused
+%assign shift 14 - %2
+
+COMMON_DEFS %1, %2
+
+%if pixelsize == 1
+    pxor      m0, m0
+%endif
+    shr       heightd, %3
+.loop:
+%assign i 0
+%rep (1 << %3)
+%assign j 0
+%rep nb_blocks
+    BLOCK_DEFS j
+    LOAD      m1, [srcq + j * pixelsize * blocksize]
+%if pixelsize == 1
+    punpcklbw m1, m0
+%endif
+    psllw     m1, shift
+    STORE     [dstq + j * 2 * blocksize], m1
+%assign j (j + 1)
+%endrep
+    add       dstq, dststrideq
+    add       srcq, srcstrideq
+%assign i (i + 1)
+%endrep
+    dec       heightd
+    jg        .loop
+    RET
+%endmacro
+INIT_XMM sse2
+GET_PIXELS  4, 8, 1
+GET_PIXELS  8, 8, 1
+GET_PIXELS 12, 8, 3
+GET_PIXELS 16, 8, 2
+GET_PIXELS 24, 8, 3
+GET_PIXELS 32, 8, 3
+GET_PIXELS 48, 8, 3
+GET_PIXELS 64, 8, 3
+
+GET_PIXELS  4, 10, 1
+GET_PIXELS  8, 10, 1
+GET_PIXELS 12, 10, 3
+GET_PIXELS 16, 10, 2
+GET_PIXELS 24, 10, 3
+GET_PIXELS 32, 10, 3
+GET_PIXELS 48, 10, 3
+GET_PIXELS 64, 10, 3
+; hevc_qpel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
+;                     uint8_t *src, ptrdiff_t srcstride,
+;                     int height, int mx, int my, int16_t *mcbuffer)
+;
+; 8-bit qpel interpolation
+; %1: block width
+; %2: 0 - horizontal; 1 - vertical
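+;
+; The eight taps are applied two at a time: punpcklbw interleaves the
+; samples of taps 2j and 2j+1, pmaddubsw multiplies them with the matching
+; signed-byte coefficient pairs from ff_hevc_qpel_coeffs8, and paddsw
+; accumulates the four partial sums.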
+%macro QPEL_8 2
+%if %2
+    %define postfix v
+    %define mvfrac myq
+    %define coeffsaddr r5q
+    %define pixstride srcstrideq
+    %define pixstride3 r5q
+    %define src_m3 r6q
+%else
+    %define postfix h
+    %define mvfrac mxq
+    %define coeffsaddr r6q
+    %define pixstride 1
+    %define pixstride3 3
+    %define src_m3 (srcq - 3)
+%endif
+
+COMMON_DEFS %1, 8
+
+cglobal hevc_qpel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 7, dst, dststride, src, srcstride, height, mx, my
+    and       mvfrac, 0x3
+    dec       mvfrac
+    shl       mvfrac, 4
+    lea       coeffsaddr, [hevc_qpel_coeffs8]
+    mova      m0, [coeffsaddr + mvfrac]
+    SPLATW    m1, m0, 1
+    SPLATW    m2, m0, 2
+    SPLATW    m3, m0, 3
+    SPLATW    m0, m0, 0
+
+%if %2
+    lea       pixstride3, [srcstrideq + 2 * srcstrideq]
+    mov       src_m3, srcq
+    sub       src_m3, pixstride3
+%endif
+
+.loop:
+%assign i 0
+%rep nb_blocks
+    BLOCK_DEFS i
+    LOAD      m4, [src_m3 + i * blocksize]
+    LOAD      m5, [src_m3 + i * blocksize + 1 * pixstride]
+    punpcklbw m4, m5
+    pmaddubsw m4, m0
+
+    LOAD      m5, [src_m3 + i * blocksize + 2 * pixstride]
+    LOAD      m6, [srcq   + i * blocksize]
+    punpcklbw m5, m6
+    pmaddubsw m5, m1
+    paddsw    m4, m5
+
+    LOAD      m5, [srcq + i * blocksize + 1 * pixstride]
+    LOAD      m6, [srcq + i * blocksize + 2 * pixstride]
+    punpcklbw m5, m6
+    pmaddubsw m5, m2
+    paddsw    m4, m5
+
+    LOAD      m5, [srcq + i * blocksize + pixstride3]
+    LOAD      m6, [srcq + i * blocksize + 4 * pixstride]
+    punpcklbw m5, m6
+    pmaddubsw m5, m3
+    paddsw    m4, m5
+
+    STORE     [dstq + i * 2 * blocksize], m4
+%assign i (i + 1)
+%endrep
+
+    add       dstq, dststrideq
+    add       srcq, srcstrideq
+%if %2
+    add       src_m3, srcstrideq
+%endif
+    dec       heightd
+    jg        .loop
+    RET
+%endmacro
+INIT_XMM ssse3
+QPEL_8  4, 0
+QPEL_8  8, 0
+QPEL_8 12, 0
+QPEL_8 16, 0
+QPEL_8 24, 0
+QPEL_8 32, 0
+QPEL_8 48, 0
+QPEL_8 64, 0
+
+QPEL_8  4, 1
+QPEL_8  8, 1
+QPEL_8 12, 1
+QPEL_8 16, 1
+QPEL_8 24, 1
+QPEL_8 32, 1
+QPEL_8 48, 1
+QPEL_8 64, 1
+; 16-bit qpel interpolation
+; %1: block width
+; %2: shift applied to the result
+; %3: 0 - horizontal; 1 - vertical
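+;
+; the 16-bit samples are widened with punpck{l,h}wd and filtered two taps
+; at a time with pmaddwd; %2 then drops the extra precision again --
+; (bit depth - 8) for the plain h/v passes, 6 for the second pass of the
+; hv filter, which runs on the 14-bit intermediate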
+%macro QPEL_16 3
+%if %3
+    %define mvfrac myq
+    %define pixstride srcstrideq
+    %define pixstride3 sstride3q
+    %define src_m3 srcm3q
+%else
+    %define mvfrac mxq
+    %define pixstride 2
+    %define pixstride3 6
+    %define src_m3 (srcq - 6)
+%endif
+
+COMMON_DEFS %1, 16
+
+    and       mvfrac, 0x3
+    dec       mvfrac
+    shl       mvfrac, 4
+    lea       coeffsregq, [hevc_qpel_coeffs]
+    mova      m0, [coeffsregq + mvfrac]
+    pshufd    m1, m0, 0x55
+    pshufd    m2, m0, 0xaa
+    pshufd    m3, m0, 0xff
+    pshufd    m0, m0, 0x00
+
+%if %3
+    lea       sstride3q, [srcstrideq + 2 * srcstrideq]
+    mov       srcm3q, srcq
+    sub       srcm3q, sstride3q
+%endif
+
+.loop:
+%assign i 0
+%rep nb_blocks
+    BLOCK_DEFS i
+    LOAD      m4,  [src_m3 + i * 2 * blocksize]
+    LOAD      m5,  [src_m3 + i * 2 * blocksize + 1 * pixstride]
+    LOAD      m6,  [src_m3 + i * 2 * blocksize + 2 * pixstride]
+    LOAD      m7,  [srcq   + i * 2 * blocksize + 0 * pixstride]
+    LOAD      m8,  [srcq   + i * 2 * blocksize + 1 * pixstride]
+    LOAD      m9,  [srcq   + i * 2 * blocksize + 2 * pixstride]
+    LOAD      m10, [srcq   + i * 2 * blocksize + pixstride3]
+    LOAD      m11, [srcq   + i * 2 * blocksize + 4 * pixstride]
+
+    punpcklwd m12, m4, m5
+    pmaddwd   m12, m0
+    punpcklwd m13, m6, m7
+    pmaddwd   m13, m1
+    paddd     m12, m13
+    punpcklwd m13, m8, m9
+    pmaddwd   m13, m2
+    paddd     m12, m13
+    punpcklwd m13, m10, m11
+    pmaddwd   m13, m3
+    paddd     m12, m13
+    psrad     m12, %2
+
+%if block_truncated == 0
+    punpckhwd m4, m5
+    pmaddwd   m4, m0
+    punpckhwd m6, m7
+    pmaddwd   m6, m1
+    paddd     m4, m6
+    punpckhwd m8, m9
+    pmaddwd   m8, m2
+    paddd     m4, m8
+    punpckhwd m10, m11
+    pmaddwd   m10, m3
+    paddd     m4, m10
+    psrad     m4, %2
+%endif
+
+    packssdw  m12, m4
+    STORE     [dstq + i * 2 * blocksize], m12
+%assign i (i + 1)
+%endrep
+
+    add       dstq, dststrideq
+    add       srcq, srcstrideq
+%if %3
+    add       srcm3q, srcstrideq
+%endif
+    dec       heightd
+    jg        .loop
+    RET
+%endmacro
+%if ARCH_X86_64
+
+%macro QPEL_H_10 1
+cglobal hevc_qpel_h_ %+ %1 %+ _10, 7, 9, 14, dst, dststride, src, srcstride, height, mx, my, mcbuffer, coeffsreg
+    QPEL_16 %1, 2, 0
+%endmacro
+
+INIT_XMM avx
+QPEL_H_10  4
+QPEL_H_10  8
+QPEL_H_10 12
+QPEL_H_10 16
+QPEL_H_10 24
+QPEL_H_10 32
+QPEL_H_10 48
+QPEL_H_10 64
+%macro QPEL_V_10 1
+cglobal hevc_qpel_v_ %+ %1 %+ _10, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
+    QPEL_16 %1, 2, 1
+%endmacro
+
+INIT_XMM avx
+QPEL_V_10  4
+QPEL_V_10  8
+QPEL_V_10 12
+QPEL_V_10 16
+QPEL_V_10 24
+QPEL_V_10 32
+QPEL_V_10 48
+QPEL_V_10 64
+; hevc_qpel_hv_<w>(int16_t *dst, ptrdiff_t dststride,
+;                  int16_t *src, ptrdiff_t srcstride,
+;                  int height, int mx, int my, int16_t *mcbuffer)
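+;
+; only the vertical pass lives here: the wrappers in hevcdsp_init.c first
+; run the horizontal filter into mcbuffer and then call this on the int16
+; intermediate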
+%macro QPEL_HV 1
+cglobal hevc_qpel_hv_ %+ %1, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
+    QPEL_16 %1, 6, 1
+%endmacro
+
+INIT_XMM avx
+QPEL_HV  4
+QPEL_HV  8
+QPEL_HV 12
+QPEL_HV 16
+QPEL_HV 24
+QPEL_HV 32
+QPEL_HV 48
+QPEL_HV 64
+
+%endif ; ARCH_X86_64
+; hevc_epel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
+;                     uint8_t *src, ptrdiff_t srcstride,
+;                     int height, int mx, int my, int16_t *mcbuffer)
+;
+; 8-bit epel interpolation
+; %1: block width
+; %2: 0 - horizontal; 1 - vertical
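+;
+; same scheme as QPEL_8, but with the 4-tap chroma filter: srcq is
+; pre-decremented by one tap so the four loads cover taps -1..2, and only
+; two pmaddubsw/paddsw steps are needed per block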
+%macro EPEL_8 2
+%if %2
+    %define postfix v
+    %define mvfrac myq
+    %define coeffsaddr r5q
+    %define pixstride srcstrideq
+    %define pixstride3 r5q
+%else
+    %define postfix h
+    %define mvfrac mxq
+    %define coeffsaddr r6q
+    %define pixstride 1
+    %define pixstride3 3
+%endif
+
+COMMON_DEFS %1, 8
+
+cglobal hevc_epel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 6, dst, dststride, src, srcstride, height, mx, my
+    and       mvfrac, 0x7
+    dec       mvfrac
+    shl       mvfrac, 4
+    lea       coeffsaddr, [hevc_epel_coeffs8]
+    movq      m0, [coeffsaddr + mvfrac]
+    SPLATW    m1, m0, 1
+    SPLATW    m0, m0, 0
+
+%if %2
+    lea       pixstride3, [srcstrideq + 2 * srcstrideq]
+%endif
+    sub       srcq, pixstride
+
+.loop:
+%assign i 0
+%rep nb_blocks
+    BLOCK_DEFS i
+    LOAD      m2, [srcq + i * blocksize + 0 * pixstride]
+    LOAD      m3, [srcq + i * blocksize + 1 * pixstride]
+    LOAD      m4, [srcq + i * blocksize + 2 * pixstride]
+    LOAD      m5, [srcq + i * blocksize + pixstride3]
+    punpcklbw m2, m3
+    punpcklbw m4, m5
+    pmaddubsw m2, m0
+    pmaddubsw m4, m1
+    paddsw    m2, m4
+    STORE     [dstq + i * 2 * blocksize], m2
+%assign i (i + 1)
+%endrep
+
+    add       dstq, dststrideq
+    add       srcq, srcstrideq
+    dec       heightd
+    jg        .loop
+    RET
+%endmacro
+INIT_XMM ssse3
+EPEL_8  4, 0
+EPEL_8  8, 0
+EPEL_8 12, 0
+EPEL_8 16, 0
+EPEL_8 24, 0
+EPEL_8 32, 0
+
+EPEL_8  4, 1
+EPEL_8  8, 1
+EPEL_8 12, 1
+EPEL_8 16, 1
+EPEL_8 24, 1
+EPEL_8 32, 1
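+
+; 16-bit epel interpolation
+; %1: block width
+; %2: shift applied to the result
+; %3: 0 - horizontal; 1 - vertical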
+%macro EPEL_16 3
+%if %3
+    %define mvfrac myq
+    %define pixstride srcstrideq
+    %define pixstride3 sstride3q
+%else
+    %define mvfrac mxq
+    %define pixstride 2
+    %define pixstride3 6
+%endif
+
+COMMON_DEFS %1, 16
+
+    and       mvfrac, 0x7
+    dec       mvfrac
+    shl       mvfrac, 5
+    lea       coeffsregq, [hevc_epel_coeffs]
+    mova      m0, [coeffsregq + mvfrac]
+    pshufd    m1, m0, 0x55
+    pshufd    m0, m0, 0x00
+
+%if %3
+    lea       sstride3q, [srcstrideq + 2 * srcstrideq]
+%endif
+    sub       srcq, pixstride
+
+.loop:
+%assign i 0
+%rep nb_blocks
+    BLOCK_DEFS i
+    LOAD      m2, [srcq + i * 2 * blocksize + 0 * pixstride]
+    LOAD      m3, [srcq + i * 2 * blocksize + 1 * pixstride]
+    LOAD      m4, [srcq + i * 2 * blocksize + 2 * pixstride]
+    LOAD      m5, [srcq + i * 2 * blocksize + pixstride3]
+
+    punpcklwd m6, m2, m3
+    punpcklwd m7, m4, m5
+    pmaddwd   m6, m0
+    pmaddwd   m7, m1
+    paddd     m6, m7
+    psrad     m6, %2
+
+%if block_truncated == 0
+    punpckhwd m2, m3
+    punpckhwd m4, m5
+    pmaddwd   m2, m0
+    pmaddwd   m4, m1
+    paddd     m2, m4
+    psrad     m2, %2
+%endif
+
+    packssdw  m6, m2
+    STORE     [dstq + i * 2 * blocksize], m6
+%assign i (i + 1)
+%endrep
+
+    add       dstq, dststrideq
+    add       srcq, srcstrideq
+    dec       heightd
+    jg        .loop
+    RET
+%endmacro
+%if ARCH_X86_64
+
+%macro EPEL_H_10 1
+cglobal hevc_epel_h_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
+    EPEL_16 %1, 2, 0
+%endmacro
+
+INIT_XMM avx
+EPEL_H_10  4
+EPEL_H_10  8
+EPEL_H_10 12
+EPEL_H_10 16
+EPEL_H_10 24
+EPEL_H_10 32
+
+%macro EPEL_V_10 1
+cglobal hevc_epel_v_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
+    EPEL_16 %1, 2, 1
+%endmacro
+
+INIT_XMM avx
+EPEL_V_10  4
+EPEL_V_10  8
+EPEL_V_10 12
+EPEL_V_10 16
+EPEL_V_10 24
+EPEL_V_10 32
+; hevc_epel_hv_<w>(int16_t *dst, ptrdiff_t dststride,
+;                  int16_t *src, ptrdiff_t srcstride,
+;                  int height, int mx, int my, int16_t *mcbuffer)
+%macro EPEL_HV 1
+cglobal hevc_epel_hv_ %+ %1, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
+    EPEL_16 %1, 6, 1
+%endmacro
+
+INIT_XMM avx
+EPEL_HV  4
+EPEL_HV  8
+EPEL_HV 12
+EPEL_HV 16
+EPEL_HV 24
+EPEL_HV 32
+
+%endif ; ARCH_X86_64
+; hevc_put_unweighted_pred_<w>_<d>(pixel *dst, ptrdiff_t dststride,
+;                                  int16_t *src, ptrdiff_t srcstride,
+;                                  int height)
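+
+; adds the (optional) second source to %1
+; %1: accumulator register
+; %2: second-source memory operand
+; %3: 0 - one source (no-op); 1 - two sources
+; %4: remaining block width; a remainder of 4 has only 8 valid bytes, so it
+;     is loaded with movq through %5
+; %5: temporary register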
+%macro AVG 5
+%if %3
+%if %4 == 4
+    movq      %5, %2
+    paddsw    %1, %5
+%else
+    paddsw    %1, %2
+%endif
+%endif
+%endmacro
+; %1: 0 - one source; 1 - two sources
+; %2: width
+; %3: bit depth
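+;
+; converts the 14-bit (15-bit when averaging two sources) intermediate back
+; to pixels: shift = 14 + %1 - bit depth, with the rounding constant
+; 1 << (shift - 1) taken from the corresponding pw_* table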
+%macro PUT_PRED 3
+%if %1
+cglobal hevc_put_unweighted_pred_avg_ %+ %2 %+ _ %+ %3, 6, 6, 4, dst, dststride, src, src2, srcstride, height
+%else
+cglobal hevc_put_unweighted_pred_ %+ %2 %+ _ %+ %3, 5, 5, 4, dst, dststride, src, srcstride, height
+%endif
+%assign shift 14 + %1 - %3
+%assign offset (1 << (shift - 1))
+%define offset_data pw_ %+ offset
+    mova      m0, [offset_data]
+
+%if %3 > 8
+    %define STORE_BLOCK movu
+    %define STORE_HALF  movq
+    %assign pixel_max ((1 << %3) - 1)
+    %define pw_pixel_max pw_ %+ pixel_max
+    pxor      m1, m1
+    mova      m2, [pw_pixel_max]
+%else
+    %define STORE_BLOCK movq
+    %define STORE_HALF  movd
+%endif
+
+.loop:
+%assign i 0
+%rep (%2 + 7) / 8
+%if (i + 1) * 8 > %2
+    %define LOAD  movq
+    %define STORE STORE_HALF
+%else
+    %define LOAD  mova
+    %define STORE STORE_BLOCK
+%endif
+    LOAD      m3, [srcq + 16 * i]
+    AVG       m3, [src2q + 16 * i], %1, %2 - i * 8, m4
+    paddsw    m3, m0
+    psraw     m3, shift
+
+%if %3 == 8
+    packuswb  m3, m3
+    STORE     [dstq + 8 * i], m3
+%else
+    CLIPW     m3, m1, m2
+    STORE     [dstq + 16 * i], m3
+%endif
+%assign i (i + 1)
+%endrep
+
+    add       dstq, dststrideq
+    add       srcq, srcstrideq
+%if %1
+    add       src2q, srcstrideq
+%endif
+    dec       heightd
+    jg        .loop
+    RET
+%endmacro
+INIT_XMM sse2
+PUT_PRED 0,  4, 8
+PUT_PRED 1,  4, 8
+PUT_PRED 0,  8, 8
+PUT_PRED 1,  8, 8
+PUT_PRED 0, 12, 8
+PUT_PRED 1, 12, 8
+PUT_PRED 0, 16, 8
+PUT_PRED 1, 16, 8
+PUT_PRED 0, 24, 8
+PUT_PRED 1, 24, 8
+PUT_PRED 0, 32, 8
+PUT_PRED 1, 32, 8
+PUT_PRED 0, 48, 8
+PUT_PRED 1, 48, 8
+PUT_PRED 0, 64, 8
+PUT_PRED 1, 64, 8
+
+PUT_PRED 0,  4, 10
+PUT_PRED 1,  4, 10
+PUT_PRED 0,  8, 10
+PUT_PRED 1,  8, 10
+PUT_PRED 0, 12, 10
+PUT_PRED 1, 12, 10
+PUT_PRED 0, 16, 10
+PUT_PRED 1, 16, 10
+PUT_PRED 0, 24, 10
+PUT_PRED 1, 24, 10
+PUT_PRED 0, 32, 10
+PUT_PRED 1, 32, 10
+PUT_PRED 0, 48, 10
+PUT_PRED 1, 48, 10
+PUT_PRED 0, 64, 10
+PUT_PRED 1, 64, 10
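+
+; HEVC explicit weighted prediction
+; %1: 0 - one source; 1 - two sources
+; %2: width
+; %3: bit depth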
+%macro PUT_WEIGHTED_PRED 3
+%if %1
+cglobal hevc_put_weighted_pred_avg_ %+ %2 %+ _ %+ %3, 11, 11, 8, denom, weight0, weight1, offset0, offset1, dst, dststride, src0, src1, srcstride, height
+%else
+cglobal hevc_put_weighted_pred_ %+ %2 %+ _ %+ %3, 8, 8, 8, denom, weight0, offset0, dst, dststride, src0, srcstride, height
+%endif
+    and       denomd, 0xff
+    movsx     weight0d, weight0w
+    movsx     offset0d, offset0w
+%if %1
+    movsx     weight1d, weight1w
+    movsx     offset1d, offset1w
+%endif
+    add       denomd, 14 + %1 - %3
+    movd      m0, denomd
+
+%if %3 > 8
+    %assign pixel_max ((1 << %3) - 1)
+    %define pw_pixel_max pw_ %+ pixel_max
+    pxor      m4, m4
+    mova      m5, [pw_pixel_max]
+
+    shl       offset0d, %3 - 8
+%if %1
+    shl       offset1d, %3 - 8
+%endif
+%endif
+
+%if %1
+    lea       offset0d, [offset0d + offset1d + 1]
+%else
+    lea       offset0d, [2 * offset0d + 1]
+%endif
+    movd      m1, offset0d
+    SPLATD    m1
+    pslld     m1, m0
+    psrad     m1, 1
+
+    movd      m2, weight0d
+    SPLATD    m2
+%if %1
+    movd      m3, weight1d
+    SPLATD    m3
+%endif
+
+.loop:
+%assign i 0
+%rep (%2 + 3) / 4
+    pmovsxwd  m6, [src0q + 8 * i]
+    pmulld    m6, m2
+%if %1
+    pmovsxwd  m7, [src1q + 8 * i]
+    pmulld    m7, m3
+    paddd     m6, m7
+%endif
+    paddd     m6, m1
+    psrad     m6, m0
+    packssdw  m6, m6
+%if %3 > 8
+    CLIPW     m6, m4, m5
+    movq      [dstq + 8 * i], m6
+%else
+    packuswb  m6, m6
+    movd      [dstq + 4 * i], m6
+%endif
+%assign i (i + 1)
+%endrep
+
+    add       dstq, dststrideq
+    add       src0q, srcstrideq
+%if %1
+    add       src1q, srcstrideq
+%endif
+    dec       heightd
+    jg        .loop
+    RET
+%endmacro
+%if ARCH_X86_64
+INIT_XMM sse4
+PUT_WEIGHTED_PRED 0,  4, 8
+PUT_WEIGHTED_PRED 1,  4, 8
+PUT_WEIGHTED_PRED 0,  8, 8
+PUT_WEIGHTED_PRED 1,  8, 8
+PUT_WEIGHTED_PRED 0, 12, 8
+PUT_WEIGHTED_PRED 1, 12, 8
+PUT_WEIGHTED_PRED 0, 16, 8
+PUT_WEIGHTED_PRED 1, 16, 8
+PUT_WEIGHTED_PRED 0, 24, 8
+PUT_WEIGHTED_PRED 1, 24, 8
+PUT_WEIGHTED_PRED 0, 32, 8
+PUT_WEIGHTED_PRED 1, 32, 8
+PUT_WEIGHTED_PRED 0, 48, 8
+PUT_WEIGHTED_PRED 1, 48, 8
+PUT_WEIGHTED_PRED 0, 64, 8
+PUT_WEIGHTED_PRED 1, 64, 8
+
+PUT_WEIGHTED_PRED 0,  4, 10
+PUT_WEIGHTED_PRED 1,  4, 10
+PUT_WEIGHTED_PRED 0,  8, 10
+PUT_WEIGHTED_PRED 1,  8, 10
+PUT_WEIGHTED_PRED 0, 12, 10
+PUT_WEIGHTED_PRED 1, 12, 10
+PUT_WEIGHTED_PRED 0, 16, 10
+PUT_WEIGHTED_PRED 1, 16, 10
+PUT_WEIGHTED_PRED 0, 24, 10
+PUT_WEIGHTED_PRED 1, 24, 10
+PUT_WEIGHTED_PRED 0, 32, 10
+PUT_WEIGHTED_PRED 1, 32, 10
+PUT_WEIGHTED_PRED 0, 48, 10
+PUT_WEIGHTED_PRED 1, 48, 10
+PUT_WEIGHTED_PRED 0, 64, 10
+PUT_WEIGHTED_PRED 1, 64, 10
+%endif ; ARCH_X86_64
@@ -45,27 +45,260 @@ LFC_FUNCS(uint8_t, 10)
 LFL_FUNCS(uint8_t, 8)
 LFL_FUNCS(uint8_t, 10)
+#define GET_PIXELS(width, depth, cf)                                                          \
+void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
+                                                           uint8_t *src, ptrdiff_t srcstride, \
+                                                           int height, int mx, int my, int16_t *mcbuffer);
+
+GET_PIXELS(4,  8, sse2)
+GET_PIXELS(8,  8, sse2)
+GET_PIXELS(12, 8, sse2)
+GET_PIXELS(16, 8, sse2)
+GET_PIXELS(24, 8, sse2)
+GET_PIXELS(32, 8, sse2)
+GET_PIXELS(48, 8, sse2)
+GET_PIXELS(64, 8, sse2)
+
+GET_PIXELS(4,  10, sse2)
+GET_PIXELS(8,  10, sse2)
+GET_PIXELS(12, 10, sse2)
+GET_PIXELS(16, 10, sse2)
+GET_PIXELS(24, 10, sse2)
+GET_PIXELS(32, 10, sse2)
+GET_PIXELS(48, 10, sse2)
+GET_PIXELS(64, 10, sse2)
+
+/* those are independent of the bit depth, so declared separately */
+#define INTERP_HV_FUNC(width, cf)                                             \
+void ff_hevc_qpel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride,  \
+                                          int16_t *src, ptrdiff_t srcstride,  \
+                                          int height, int mx, int my, int16_t *mcbuffer); \
+void ff_hevc_epel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride,  \
+                                          int16_t *src, ptrdiff_t srcstride,  \
+                                          int height, int mx, int my, int16_t *mcbuffer);
+
+INTERP_HV_FUNC(4,  avx)
+INTERP_HV_FUNC(8,  avx)
+INTERP_HV_FUNC(12, avx)
+INTERP_HV_FUNC(16, avx)
+INTERP_HV_FUNC(24, avx)
+INTERP_HV_FUNC(32, avx)
+INTERP_HV_FUNC(48, avx)
+INTERP_HV_FUNC(64, avx)
+
+#if ARCH_X86_64
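+/* Combined hv filtering is done in two passes: the horizontal asm writes
+ * (height + 7) rows of int16 intermediate into mcbuffer, starting 3 rows
+ * above the block, and the hv kernel then applies the vertical filter to
+ * it. x86_64 only, as the 16-bit qpel kernels use xmm8-xmm13. */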
+#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)                                              \
+static void hevc_qpel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride,  \
+                                                               uint8_t *src, ptrdiff_t srcstride,  \
+                                                               int height, int mx, int my,         \
+                                                               int16_t *mcbuffer)                  \
+{                                                                                                  \
+    const ptrdiff_t stride = FFALIGN(width + 7, 8);                                                \
+    ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - 3 * srcstride, \
+                                                        srcstride, height + 7, mx, my, mcbuffer);  \
+    ff_hevc_qpel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + 3 * stride, 2 * stride,     \
+                                            height, mx, my, mcbuffer);                             \
+}
+#else
+#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
+#endif
+
+#define QPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv)                                          \
+void ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \
+                                                         uint8_t *src, ptrdiff_t srcstride, \
+                                                         int height, int mx, int my, int16_t *mcbuffer); \
+void ff_hevc_qpel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \
+                                                         uint8_t *src, ptrdiff_t srcstride, \
+                                                         int height, int mx, int my, int16_t *mcbuffer); \
+QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
+
+QPEL_FUNCS(4,  8, ssse3, ssse3, avx)
+QPEL_FUNCS(8,  8, ssse3, ssse3, avx)
+QPEL_FUNCS(12, 8, ssse3, ssse3, avx)
+QPEL_FUNCS(16, 8, ssse3, ssse3, avx)
+QPEL_FUNCS(24, 8, ssse3, ssse3, avx)
+QPEL_FUNCS(32, 8, ssse3, ssse3, avx)
+QPEL_FUNCS(48, 8, ssse3, ssse3, avx)
+QPEL_FUNCS(64, 8, ssse3, ssse3, avx)
+
+QPEL_FUNCS(4,  10, avx, avx, avx)
+QPEL_FUNCS(8,  10, avx, avx, avx)
+QPEL_FUNCS(12, 10, avx, avx, avx)
+QPEL_FUNCS(16, 10, avx, avx, avx)
+QPEL_FUNCS(24, 10, avx, avx, avx)
+QPEL_FUNCS(32, 10, avx, avx, avx)
+QPEL_FUNCS(48, 10, avx, avx, avx)
+QPEL_FUNCS(64, 10, avx, avx, avx)
+
+#if ARCH_X86_64
+#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)                                              \
+static void hevc_epel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride,  \
+                                                               uint8_t *src, ptrdiff_t srcstride,  \
+                                                               int height, int mx, int my,         \
+                                                               int16_t *mcbuffer)                  \
+{                                                                                                  \
+    const ptrdiff_t stride = FFALIGN(width + 3, 8);                                                \
+    ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - srcstride,     \
+                                                        srcstride, height + 3, mx, my, mcbuffer);  \
+    ff_hevc_epel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + stride, 2 * stride,         \
+                                            height, mx, my, mcbuffer);                             \
+}
+#else
+#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
+#endif
+
+#define EPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv)                                          \
+void ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \
+                                                         uint8_t *src, ptrdiff_t srcstride, \
+                                                         int height, int mx, int my, int16_t *mcbuffer); \
+void ff_hevc_epel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \
+                                                         uint8_t *src, ptrdiff_t srcstride, \
+                                                         int height, int mx, int my, int16_t *mcbuffer); \
+EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
+
+EPEL_FUNCS(4,  8, ssse3, ssse3, avx)
+EPEL_FUNCS(8,  8, ssse3, ssse3, avx)
+EPEL_FUNCS(12, 8, ssse3, ssse3, avx)
+EPEL_FUNCS(16, 8, ssse3, ssse3, avx)
+EPEL_FUNCS(24, 8, ssse3, ssse3, avx)
+EPEL_FUNCS(32, 8, ssse3, ssse3, avx)
+
+EPEL_FUNCS(4,  10, avx, avx, avx)
+EPEL_FUNCS(8,  10, avx, avx, avx)
+EPEL_FUNCS(12, 10, avx, avx, avx)
+EPEL_FUNCS(16, 10, avx, avx, avx)
+EPEL_FUNCS(24, 10, avx, avx, avx)
+EPEL_FUNCS(32, 10, avx, avx, avx)
+
+#define PUT_PRED(width, depth, cf_uw, cf_w)                                                               \
+void ff_hevc_put_unweighted_pred_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride, \
+                                                                       int16_t *src, ptrdiff_t srcstride, \
+                                                                       int height);                       \
+void ff_hevc_put_unweighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride, \
+                                                                           int16_t *src1, int16_t *src2,  \
+                                                                           ptrdiff_t srcstride, int height); \
+void ff_hevc_put_weighted_pred_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight, int16_t offset, \
+                                                                    uint8_t *dst, ptrdiff_t dststride,    \
+                                                                    int16_t *src, ptrdiff_t srcstride,    \
+                                                                    int height);                          \
+void ff_hevc_put_weighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight0, int16_t weight1, \
+                                                                        int16_t offset0, int16_t offset1, \
+                                                                        uint8_t *dst, ptrdiff_t dststride, \
+                                                                        int16_t *src0, int16_t *src1, ptrdiff_t srcstride, \
+                                                                        int height);
+
+PUT_PRED(4,  8, sse2, sse4)
+PUT_PRED(8,  8, sse2, sse4)
+PUT_PRED(12, 8, sse2, sse4)
+PUT_PRED(16, 8, sse2, sse4)
+PUT_PRED(24, 8, sse2, sse4)
+PUT_PRED(32, 8, sse2, sse4)
+PUT_PRED(48, 8, sse2, sse4)
+PUT_PRED(64, 8, sse2, sse4)
+
+PUT_PRED(4,  10, sse2, sse4)
+PUT_PRED(8,  10, sse2, sse4)
+PUT_PRED(12, 10, sse2, sse4)
+PUT_PRED(16, 10, sse2, sse4)
+PUT_PRED(24, 10, sse2, sse4)
+PUT_PRED(32, 10, sse2, sse4)
+PUT_PRED(48, 10, sse2, sse4)
+PUT_PRED(64, 10, sse2, sse4)
 
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 {
     int cpu_flags = av_get_cpu_flags();
+
+#define SET_LUMA_FUNCS(tabname, funcname, depth, cf)      \
+    c->tabname[0] = funcname ## _4_  ## depth ## _ ## cf; \
+    c->tabname[1] = funcname ## _8_  ## depth ## _ ## cf; \
+    c->tabname[2] = funcname ## _12_ ## depth ## _ ## cf; \
+    c->tabname[3] = funcname ## _16_ ## depth ## _ ## cf; \
+    c->tabname[4] = funcname ## _24_ ## depth ## _ ## cf; \
+    c->tabname[5] = funcname ## _32_ ## depth ## _ ## cf; \
+    c->tabname[6] = funcname ## _48_ ## depth ## _ ## cf; \
+    c->tabname[7] = funcname ## _64_ ## depth ## _ ## cf;
+
+#define SET_CHROMA_FUNCS(tabname, funcname, depth, cf)    \
+    c->tabname[1] = funcname ## _4_  ## depth ## _ ## cf; \
+    c->tabname[3] = funcname ## _8_  ## depth ## _ ## cf; \
+    c->tabname[4] = funcname ## _12_ ## depth ## _ ## cf; \
+    c->tabname[5] = funcname ## _16_ ## depth ## _ ## cf; \
+    c->tabname[6] = funcname ## _24_ ## depth ## _ ## cf; \
+    c->tabname[7] = funcname ## _32_ ## depth ## _ ## cf;
+
+#define SET_QPEL_FUNCS(v, h, depth, cf, name) SET_LUMA_FUNCS  (put_hevc_qpel[v][h], name, depth, cf)
+#define SET_EPEL_FUNCS(v, h, depth, cf, name) SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf)
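+
+/* The [v][h] indices select the filter type (0/0 being the plain copy).
+ * The luma tables are indexed by block width (4..64 -> 0..7); the chroma
+ * tables hold the half-width functions, which is why widths 4..32 land at
+ * indices 1, 3..7, with widths 2 and 6 presumably left to the C versions. */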
 
     if (bit_depth == 8) {
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
+
+            SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
+            SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
+
+            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     8, sse2);
+            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 8, sse2);
+            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     8, sse2);
+            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
         }
-        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
-            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
-            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+        if (EXTERNAL_SSSE3(cpu_flags)) {
+            SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
+            SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
+            SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
+            SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
+
+            SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
+            SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
+
+            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     10, sse2);
+            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 10, sse2);
+            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     10, sse2);
+            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
+        }
+    }
+
+#if ARCH_X86_64
+    if (bit_depth == 8) {
+        if (EXTERNAL_SSSE3(cpu_flags)) {
+            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
+            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+        }
+        if (EXTERNAL_SSE4(cpu_flags)) {
+            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     8, sse4);
+            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     8, sse4);
+            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 8, sse4);
+            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
         }
-        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+        if (EXTERNAL_AVX(cpu_flags)) {
+            SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
+            SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
+        }
+    } else if (bit_depth == 10) {
+        if (EXTERNAL_SSSE3(cpu_flags)) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
             c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
         }
+        if (EXTERNAL_SSE4(cpu_flags)) {
+            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     10, sse4);
+            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     10, sse4);
+            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 10, sse4);
+            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
+            SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
+            SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
+            SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
+            SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
+            SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
+        }
     }
+#endif /* ARCH_X86_64 */
 }