avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC uni mc epel functions

This patch adds MSA (MIPS-SIMD-Arch) optimizations for HEVC uni mc epel functions. It also adds the new generic macros needed by this patch to libavutil/mips/generic_macros_msa.h. Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
10 years ago · aef34ab950
--- a/libavcodec/mips/hevc_mc_uni_msa.c
+++ b/libavcodec/mips/hevc_mc_uni_msa.c
--- a/libavcodec/mips/hevcdsp_init_mips.c
+++ b/libavcodec/mips/hevcdsp_init_mips.c
@@ -129,6 +129,36 @@ static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
        c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa;
        c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa;

        c->put_hevc_epel_uni[3][0][0] = ff_hevc_put_hevc_uni_pel_pixels8_8_msa;
        c->put_hevc_epel_uni[4][0][0] = ff_hevc_put_hevc_uni_pel_pixels12_8_msa;
        c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels16_8_msa;
        c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels24_8_msa;
        c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_msa;

        c->put_hevc_epel_uni[1][0][1] = ff_hevc_put_hevc_uni_epel_h4_8_msa;
        c->put_hevc_epel_uni[2][0][1] = ff_hevc_put_hevc_uni_epel_h6_8_msa;
        c->put_hevc_epel_uni[3][0][1] = ff_hevc_put_hevc_uni_epel_h8_8_msa;
        c->put_hevc_epel_uni[4][0][1] = ff_hevc_put_hevc_uni_epel_h12_8_msa;
        c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_8_msa;
        c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_8_msa;
        c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_msa;

        c->put_hevc_epel_uni[1][1][0] = ff_hevc_put_hevc_uni_epel_v4_8_msa;
        c->put_hevc_epel_uni[2][1][0] = ff_hevc_put_hevc_uni_epel_v6_8_msa;
        c->put_hevc_epel_uni[3][1][0] = ff_hevc_put_hevc_uni_epel_v8_8_msa;
        c->put_hevc_epel_uni[4][1][0] = ff_hevc_put_hevc_uni_epel_v12_8_msa;
        c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_8_msa;
        c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_8_msa;
        c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_msa;

        c->put_hevc_epel_uni[1][1][1] = ff_hevc_put_hevc_uni_epel_hv4_8_msa;
        c->put_hevc_epel_uni[2][1][1] = ff_hevc_put_hevc_uni_epel_hv6_8_msa;
        c->put_hevc_epel_uni[3][1][1] = ff_hevc_put_hevc_uni_epel_hv8_8_msa;
        c->put_hevc_epel_uni[4][1][1] = ff_hevc_put_hevc_uni_epel_hv12_8_msa;
        c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_8_msa;
        c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_msa;
        c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_msa;

        c->put_hevc_qpel_uni_w[1][0][0] =
            ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
        c->put_hevc_qpel_uni_w[3][0][0] =
--- a/libavcodec/mips/hevcdsp_mips.h
+++ b/libavcodec/mips/hevcdsp_mips.h
@@ -145,6 +145,36 @@ UNI_MC(qpel, hv, 32);
 UNI_MC(qpel, hv, 48);
 UNI_MC(qpel, hv, 64);

 /* UNI_MC invocations for the epel variants: one per direction (h, v, hv)
    and per width (4, 6, 8, 12, 16, 24, 32, 48, 64).
    NOTE(review): widths 48 and 64 are listed here, but the
    put_hevc_epel_uni table setup appears to install only the variants up
    to width 32 - confirm whether the 48/64 entries are intended. */
 UNI_MC(epel, h, 4);
 UNI_MC(epel, h, 6);
 UNI_MC(epel, h, 8);
 UNI_MC(epel, h, 12);
 UNI_MC(epel, h, 16);
 UNI_MC(epel, h, 24);
 UNI_MC(epel, h, 32);
 UNI_MC(epel, h, 48);
 UNI_MC(epel, h, 64);

 UNI_MC(epel, v, 4);
 UNI_MC(epel, v, 6);
 UNI_MC(epel, v, 8);
 UNI_MC(epel, v, 12);
 UNI_MC(epel, v, 16);
 UNI_MC(epel, v, 24);
 UNI_MC(epel, v, 32);
 UNI_MC(epel, v, 48);
 UNI_MC(epel, v, 64);

 UNI_MC(epel, hv, 4);
 UNI_MC(epel, hv, 6);
 UNI_MC(epel, hv, 8);
 UNI_MC(epel, hv, 12);
 UNI_MC(epel, hv, 16);
 UNI_MC(epel, hv, 24);
 UNI_MC(epel, hv, 32);
 UNI_MC(epel, hv, 48);
 UNI_MC(epel, hv, 64);

 #undef UNI_MC

 #define UNI_W_MC(PEL, DIR, WIDTH)                                         \
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -291,6 +291,7 @@
    LD_B2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_B(RTYPE, (psrc) + 2 * stride);          \
 }
 #define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
 #define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
@@ -573,6 +574,18 @@
    SH(out7_m, (pblk_6x4_m + 4));              \
 }

 /* Description : Store as 8x1 byte block to destination memory from input vector
    Arguments   : Inputs  - in, pdst
    Details     : Index 0 double word element from input vector 'in' is copied
                  and stored to destination memory at (pdst).
                  'in' is parenthesized before the cast so that an expression
                  argument is converted as a whole, not only its first operand.
 */
 #define ST8x1_UB(in, pdst)                     \
 {                                              \
    uint64_t out0_m;                           \
    out0_m = __msa_copy_u_d((v2i64) (in), 0);  \
    SD(out0_m, pdst);                          \
 }

 /* Description : Store as 8x2 byte block to destination memory from input vector
   Arguments   : Inputs  - in, pdst, stride
   Details     : Index 0 double word element from input vector 'in' is copied
@@ -716,6 +729,23 @@
 }
 #define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)

 /* Description : Immediate number of columns to slide
    Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                  Outputs - out0, out1
                  Return Type - as per RTYPE
    Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                  the number of elements specified by 'slide_val', giving
                  'out0'; 'in0_1' and 'in1_1' are combined the same way to
                  give 'out1'.
                  Vector arguments are parenthesized before the v16i8 cast so
                  that expression arguments are converted as a whole.
 */
 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)      \
 {                                                                              \
    out0 = (RTYPE) __msa_sldi_b((v16i8) (in0_0), (v16i8) (in1_0), slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) (in0_1), (v16i8) (in1_1), slide_val);  \
 }
 #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
 #define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
 #define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)


 /* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
@@ -1090,6 +1120,16 @@
 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
 #define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

 /* Description : Right-interleave the byte elements of three vector pairs
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5
                  Outputs - out0, out1, out2
                  Return Type - as per RTYPE
    Details     : (in0, in1) and (in2, in3) are handed to ILVR_B2, which
                  writes 'out0' and 'out1'; 'out2' is the result of
                  __msa_ilvr_b on 'in4' and 'in5'.
                  'in4' and 'in5' are parenthesized before the v16i8 cast so
                  that expression arguments are converted as a whole.
 */
 #define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
 {                                                                       \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) (in4), (v16i8) (in5));          \
 }
 #define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
 #define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
 #define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
@@ -1306,6 +1346,7 @@
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);  \
 }
 #define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
@@ -1427,7 +1468,9 @@
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
 }
 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
 #define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

 #define XORI_B3_128(RTYPE, in0, in1, in2)          \
 {                                                  \
@@ -1628,6 +1671,14 @@
 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

 /* Description : Four-vector form of SRARI_H2
    Arguments   : Inputs  - in0, in1, in2, in3, shift
                  Return Type - as per RTYPE
    Details     : SRARI_H2 is invoked on (in0, in1) and then on (in2, in3),
                  each time with the same 'shift'; RTYPE is forwarded
                  unchanged.
                  NOTE(review): presumably the four vectors are updated in
                  place, as SRARI_H2 takes no separate outputs - confirm
                  against the SRARI_H2 definition.
 */
 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)    \
 {                                                     \
    SRARI_H2(RTYPE, in0, in1, shift);                 \
    SRARI_H2(RTYPE, in2, in3, shift);                 \
 }
 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

 /* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1     (in place)