This patch adds MSA (MIPS-SIMD-Arch) optimizations for HEVC uniw (uni weighted) mc functions
(qpel as well as epel) in the new file hevc_mc_uniw_msa.c.
It also adds the new generic macros (needed for this patch) in libavutil/mips/generic_macros_msa.h.

Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
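For orientation, the uni weighted MC functions optimized here scale a single reference block by an explicit weight, round, shift and add an offset before clipping to 8 bits. The scalar sketch below is illustrative only (the helper name is made up, and the exact shift/rounding follows my reading of the 8-bit C reference path in libavcodec/hevcdsp_template.c); the MSA code vectorizes this per row using the generic macros added at the end of this patch.

    #include <stdint.h>
    #include <stddef.h>
    #include "libavutil/common.h"

    /* Illustrative scalar sketch of 8-bit uni weighted prediction (copy case). */
    static void uni_w_copy_sketch(uint8_t *dst, ptrdiff_t dst_stride,
                                  const uint8_t *src, ptrdiff_t src_stride,
                                  int height, int width,
                                  int denom, int weight, int offset)
    {
        int x, y;
        int shift = denom + 14 - 8;   /* 8-bit path */
        int rnd   = 1 << (shift - 1);

        for (y = 0; y < height; y++) {
            for (x = 0; x < width; x++) {
                int val = (src[x] << 6) * weight;   /* 14-bit intermediate sample */
                dst[x]  = av_clip_uint8(((val + rnd) >> shift) + offset);
            }
            src += src_stride;
            dst += dst_stride;
        }
    }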
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
@@ -23,6 +23,7 @@ OBJS-$(CONFIG_H264DSP) += mips/h264dsp_init_mips.o
OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_init_mips.o
MSA-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_msa.o        \
                                   mips/hevc_mc_uni_msa.o    \
                                   mips/hevc_mc_uniw_msa.o   \
                                   mips/hevc_mc_bi_msa.o
MSA-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_msa.o
LOONGSON3-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o
diff --git a/libavcodec/mips/hevcdsp_init_mips.c b/libavcodec/mips/hevcdsp_init_mips.c
@@ -97,6 +97,99 @@ static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
        c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa;
        c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa;
        c->put_hevc_qpel_uni_w[1][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
        c->put_hevc_qpel_uni_w[3][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa;
        c->put_hevc_qpel_uni_w[4][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa;
        c->put_hevc_qpel_uni_w[5][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa;
        c->put_hevc_qpel_uni_w[6][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa;
        c->put_hevc_qpel_uni_w[7][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa;
        c->put_hevc_qpel_uni_w[8][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels48_8_msa;
        c->put_hevc_qpel_uni_w[9][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels64_8_msa;
        c->put_hevc_qpel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_qpel_h4_8_msa;
        c->put_hevc_qpel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_qpel_h8_8_msa;
        c->put_hevc_qpel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_qpel_h12_8_msa;
        c->put_hevc_qpel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_qpel_h16_8_msa;
        c->put_hevc_qpel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_qpel_h24_8_msa;
        c->put_hevc_qpel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_qpel_h32_8_msa;
        c->put_hevc_qpel_uni_w[8][0][1] = ff_hevc_put_hevc_uni_w_qpel_h48_8_msa;
        c->put_hevc_qpel_uni_w[9][0][1] = ff_hevc_put_hevc_uni_w_qpel_h64_8_msa;
        c->put_hevc_qpel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_qpel_v4_8_msa;
        c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_qpel_v8_8_msa;
        c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_qpel_v12_8_msa;
        c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_uni_w_qpel_v16_8_msa;
        c->put_hevc_qpel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_qpel_v24_8_msa;
        c->put_hevc_qpel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_qpel_v32_8_msa;
        c->put_hevc_qpel_uni_w[8][1][0] = ff_hevc_put_hevc_uni_w_qpel_v48_8_msa;
        c->put_hevc_qpel_uni_w[9][1][0] = ff_hevc_put_hevc_uni_w_qpel_v64_8_msa;
        c->put_hevc_qpel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv4_8_msa;
        c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_msa;
        c->put_hevc_qpel_uni_w[4][1][1] =
            ff_hevc_put_hevc_uni_w_qpel_hv12_8_msa;
        c->put_hevc_qpel_uni_w[5][1][1] =
            ff_hevc_put_hevc_uni_w_qpel_hv16_8_msa;
        c->put_hevc_qpel_uni_w[6][1][1] =
            ff_hevc_put_hevc_uni_w_qpel_hv24_8_msa;
        c->put_hevc_qpel_uni_w[7][1][1] =
            ff_hevc_put_hevc_uni_w_qpel_hv32_8_msa;
        c->put_hevc_qpel_uni_w[8][1][1] =
            ff_hevc_put_hevc_uni_w_qpel_hv48_8_msa;
        c->put_hevc_qpel_uni_w[9][1][1] =
            ff_hevc_put_hevc_uni_w_qpel_hv64_8_msa;
        c->put_hevc_epel_uni_w[1][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
        c->put_hevc_epel_uni_w[2][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels6_8_msa;
        c->put_hevc_epel_uni_w[3][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa;
        c->put_hevc_epel_uni_w[4][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa;
        c->put_hevc_epel_uni_w[5][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa;
        c->put_hevc_epel_uni_w[6][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa;
        c->put_hevc_epel_uni_w[7][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa;
        c->put_hevc_epel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_epel_h4_8_msa;
        c->put_hevc_epel_uni_w[2][0][1] = ff_hevc_put_hevc_uni_w_epel_h6_8_msa;
        c->put_hevc_epel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_epel_h8_8_msa;
        c->put_hevc_epel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_epel_h12_8_msa;
        c->put_hevc_epel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_epel_h16_8_msa;
        c->put_hevc_epel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_epel_h24_8_msa;
        c->put_hevc_epel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_epel_h32_8_msa;
        c->put_hevc_epel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_epel_v4_8_msa;
        c->put_hevc_epel_uni_w[2][1][0] = ff_hevc_put_hevc_uni_w_epel_v6_8_msa;
        c->put_hevc_epel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_epel_v8_8_msa;
        c->put_hevc_epel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_epel_v12_8_msa;
        c->put_hevc_epel_uni_w[5][1][0] = ff_hevc_put_hevc_uni_w_epel_v16_8_msa;
        c->put_hevc_epel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_epel_v24_8_msa;
        c->put_hevc_epel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_epel_v32_8_msa;
        c->put_hevc_epel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_epel_hv4_8_msa;
        c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_uni_w_epel_hv6_8_msa;
        c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_epel_hv8_8_msa;
        c->put_hevc_epel_uni_w[4][1][1] =
            ff_hevc_put_hevc_uni_w_epel_hv12_8_msa;
        c->put_hevc_epel_uni_w[5][1][1] =
            ff_hevc_put_hevc_uni_w_epel_hv16_8_msa;
        c->put_hevc_epel_uni_w[6][1][1] =
            ff_hevc_put_hevc_uni_w_epel_hv24_8_msa;
        c->put_hevc_epel_uni_w[7][1][1] =
            ff_hevc_put_hevc_uni_w_epel_hv32_8_msa;
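For reference, these tables are indexed as [width_idx][!!my][!!mx]: the first index is derived from the block width (4->1, 6->2, 8->3, 12->4, 16->5, 24->6, 32->7, 48->8, 64->9) and the last two flag vertical and horizontal fractional-sample filtering. A hedged sketch of a call through one of these pointers; width_to_idx() and the local variables are hypothetical stand-ins, not names from this patch:

    /* Hypothetical dispatch sketch; width_to_idx() stands in for the
     * decoder's block-width-to-slot mapping described above. */
    int idx = width_to_idx(block_w);
    c->put_hevc_qpel_uni_w[idx][!!my][!!mx](dst, dst_stride, src, src_stride,
                                            height, denom, weight, offset,
                                            mx, my, block_w);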
        c->put_hevc_qpel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
        c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
        c->put_hevc_qpel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;
diff --git a/libavcodec/mips/hevcdsp_mips.h b/libavcodec/mips/hevcdsp_mips.h
@@ -117,6 +117,90 @@ UNI_MC(qpel, hv, 64);
#undef UNI_MC
#define UNI_W_MC(PEL, DIR, WIDTH)                                           \
void ff_hevc_put_hevc_uni_w_##PEL##_##DIR####WIDTH##_8_msa(uint8_t *dst,    \
                                                           ptrdiff_t        \
                                                           dst_stride,      \
                                                           uint8_t *src,    \
                                                           ptrdiff_t        \
                                                           src_stride,      \
                                                           int height,      \
                                                           int denom,       \
                                                           int weight,      \
                                                           int offset,      \
                                                           intptr_t mx,     \
                                                           intptr_t my,     \
                                                           int width)
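As a concrete example of the prototypes this macro generates, UNI_W_MC(qpel, h, 4) below expands (modulo whitespace) to:

    void ff_hevc_put_hevc_uni_w_qpel_h4_8_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                              uint8_t *src, ptrdiff_t src_stride,
                                              int height, int denom, int weight,
                                              int offset, intptr_t mx, intptr_t my,
                                              int width);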
UNI_W_MC(pel, pixels, 4);
UNI_W_MC(pel, pixels, 6);
UNI_W_MC(pel, pixels, 8);
UNI_W_MC(pel, pixels, 12);
UNI_W_MC(pel, pixels, 16);
UNI_W_MC(pel, pixels, 24);
UNI_W_MC(pel, pixels, 32);
UNI_W_MC(pel, pixels, 48);
UNI_W_MC(pel, pixels, 64);
UNI_W_MC(qpel, h, 4);
UNI_W_MC(qpel, h, 8);
UNI_W_MC(qpel, h, 12);
UNI_W_MC(qpel, h, 16);
UNI_W_MC(qpel, h, 24);
UNI_W_MC(qpel, h, 32);
UNI_W_MC(qpel, h, 48);
UNI_W_MC(qpel, h, 64);
UNI_W_MC(qpel, v, 4);
UNI_W_MC(qpel, v, 8);
UNI_W_MC(qpel, v, 12);
UNI_W_MC(qpel, v, 16);
UNI_W_MC(qpel, v, 24);
UNI_W_MC(qpel, v, 32);
UNI_W_MC(qpel, v, 48);
UNI_W_MC(qpel, v, 64);
UNI_W_MC(qpel, hv, 4);
UNI_W_MC(qpel, hv, 8);
UNI_W_MC(qpel, hv, 12);
UNI_W_MC(qpel, hv, 16);
UNI_W_MC(qpel, hv, 24);
UNI_W_MC(qpel, hv, 32);
UNI_W_MC(qpel, hv, 48);
UNI_W_MC(qpel, hv, 64);
UNI_W_MC(epel, h, 4);
UNI_W_MC(epel, h, 6);
UNI_W_MC(epel, h, 8);
UNI_W_MC(epel, h, 12);
UNI_W_MC(epel, h, 16);
UNI_W_MC(epel, h, 24);
UNI_W_MC(epel, h, 32);
UNI_W_MC(epel, h, 48);
UNI_W_MC(epel, h, 64);
UNI_W_MC(epel, v, 4);
UNI_W_MC(epel, v, 6);
UNI_W_MC(epel, v, 8);
UNI_W_MC(epel, v, 12);
UNI_W_MC(epel, v, 16);
UNI_W_MC(epel, v, 24);
UNI_W_MC(epel, v, 32);
UNI_W_MC(epel, v, 48);
UNI_W_MC(epel, v, 64);
UNI_W_MC(epel, hv, 4);
UNI_W_MC(epel, hv, 6);
UNI_W_MC(epel, hv, 8);
UNI_W_MC(epel, hv, 12);
UNI_W_MC(epel, hv, 16);
UNI_W_MC(epel, hv, 24);
UNI_W_MC(epel, hv, 32);
UNI_W_MC(epel, hv, 48);
UNI_W_MC(epel, hv, 64);
#undef UNI_W_MC
#define BI_MC(PEL, DIR, WIDTH)                                          \
void ff_hevc_put_hevc_bi_##PEL##_##DIR####WIDTH##_8_msa(uint8_t *dst,   \
                                                        ptrdiff_t dst_stride, \
diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h
@@ -802,6 +802,34 @@
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0', producing results
                 twice the size of the input, i.e. signed words.
                 The products of adjacent odd-even element pairs are then
                 added together and stored in the 'out' vectors
                 (two signed-word result vectors)
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0); \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1); \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,          \
                 cnst0, cnst1, cnst2, cnst3,                 \
                 out0, out1, out2, out3)                     \
{                                                            \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
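A minimal usage sketch of the new dot-product macros (vector contents are illustrative; assumes msa.h and these generic macros are included):

    v8i16 mult0 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mult1 = { 8, 7, 6, 5, 4, 3, 2, 1 };
    v8i16 cnst0 = { 1, 1, 2, 2, 3, 3, 4, 4 };
    v8i16 cnst1 = { 4, 4, 3, 3, 2, 2, 1, 1 };
    v4i32 out0, out1;

    /* out0[i] = mult0[2i] * cnst0[2i] + mult0[2i + 1] * cnst0[2i + 1] */
    DOTP_SH2_SW(mult0, mult1, cnst0, cnst1, out0, out1);   /* out0 = {3, 14, 33, 60} */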
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
@@ -1017,6 +1045,7 @@
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3); \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3)                        \
@@ -1088,6 +1117,7 @@
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3); \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{                                                                      \
@@ -1555,6 +1585,31 @@
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the number of bits held in the corresponding element of
                 vector 'shift'. The last discarded bit is added to the
                 shifted value for rounding, and the result is written back
                 to 'in0' in place.
                 Here, 'shift' is a vector rather than an immediate.
                 Similar for the other pairs.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                     \
{                                                           \
    in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift); \
    in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift); \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
{                                                 \
    SRAR_W2(RTYPE, in0, in1, shift)               \
    SRAR_W2(RTYPE, in2, in3, shift)               \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
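A small usage sketch of the rounded word shift (values illustrative); since 'shift' is a vector, __msa_fill_w() can broadcast a scalar shift count into all four lanes:

    v4i32 shift = __msa_fill_w(6);          /* broadcast shift count of 6 */
    v4i32 acc0  = { 100, -100, 65, -65 };
    v4i32 acc1  = { 32, 31, -32, -31 };

    SRAR_W2_SW(acc0, acc1, shift);   /* acc0[0] = (100 + 32) >> 6 = 2 */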
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
@@ -1616,6 +1671,23 @@
    MUL2(in4, in5, in6, in7, out2, out3); \
}
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element of the 2 pairs of input vectors is added and
                 2 result vectors are produced
*/
#define ADD2(in0, in1, in2, in3, out0, out1) \
{                                            \
    out0 = in0 + in1;                        \
    out1 = in2 + in3;                        \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
{                                                                            \
    ADD2(in0, in1, in2, in3, out0, out1);                                    \
    ADD2(in4, in5, in6, in7, out2, out3);                                    \
}
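These are plain element-wise additions over GCC vector types; a quick illustrative sketch:

    v4i32 a0 = { 1, 2, 3, 4 }, a1 = { 10, 20, 30, 40 };
    v4i32 b0 = { 5, 5, 5, 5 }, b1 = { 1, 1, 1, 1 };
    v4i32 sum0, sum1;

    ADD2(a0, b0, a1, b1, sum0, sum1);   /* sum0 = {6, 7, 8, 9}, sum1 = {11, 21, 31, 41} */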
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in           (1 input unsigned byte vector)
                 Outputs - out0, out1   (2 unsigned halfword vectors)