avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC bi mc functions

This patch adds MSA (MIPS-SIMD-Arch) optimizations for HEVC bi mc functions (qpel as well as epel) in the new file hevc_mc_bi_msa.c. It adds new generic macros (needed for this patch) in libavutil/mips/generic_macros_msa.h. It adds HEVC-specific macros (needed for this patch) in libavcodec/mips/hevc_macros_msa.h. Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
10 years ago · aede1a1a60
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -21,6 +21,7 @@ MIPSFPU-OBJS-$(CONFIG_AAC_ENCODER)        += mips/iirfilter_mips.o
 OBJS-$(CONFIG_HEVC_DECODER)               += mips/hevcdsp_init_mips.o
 OBJS-$(CONFIG_H264DSP)                    += mips/h264dsp_init_mips.o
 MSA-OBJS-$(CONFIG_HEVC_DECODER)           += mips/hevcdsp_msa.o            \
                                             mips/hevc_mc_uni_msa.o
                                             mips/hevc_mc_uni_msa.o        \
                                             mips/hevc_mc_bi_msa.o
 MSA-OBJS-$(CONFIG_H264DSP)                += mips/h264dsp_msa.o
 LOONGSON3-OBJS-$(CONFIG_H264DSP)          += mips/h264dsp_mmi.o
--- a/libavcodec/mips/hevc_macros_msa.h
+++ b/libavcodec/mips/hevc_macros_msa.h
@@ -37,6 +37,27 @@
    out = (v4i32) __msa_pckev_b((v16i8) tmp1_m, (v16i8) tmp0_m);  \
 }
 /* Description : Pack eight input vectors down into two output vectors
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                  Outputs - out0, out1
    Details     : PCKEV_H4_SH packs the input pairs (in0, in1), (in2, in3),
                  (in4, in5) and (in6, in7) into four halfword temporaries.
                  PCKEV_B2_SW then packs (tmp1_m, tmp0_m) into 'out0' and
                  (tmp3_m, tmp2_m) into 'out1'.
                  NOTE(review): going by the macro name the inputs hold word
                  elements and the outputs byte elements - confirm at the
                  call sites
 */
 #define HEVC_PCK_SW_SB8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1)  \
 {                                                                            \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
                                                                             \
    PCKEV_H4_SH(in0, in1, in2, in3, in4, in5, in6, in7,                      \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                             \
    PCKEV_B2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1);                 \
 }
 /* Description : Pack twelve input vectors down into three output vectors
    Arguments   : Inputs  - in0 ... in11
                  Outputs - out0, out1, out2
    Details     : PCKEV_H4_SH / PCKEV_H2_SH pack the six input pairs
                  (in0, in1) ... (in10, in11) into six halfword temporaries.
                  PCKEV_B2_SW packs (tmp1_m, tmp0_m) into 'out0' and
                  (tmp3_m, tmp2_m) into 'out1'; 'out2' is the even-byte pack
                  of (tmp5_m, tmp4_m), cast to v4i32.
                  NOTE(review): going by the macro name the inputs hold word
                  elements and the outputs byte elements - confirm at the
                  call sites
 */
 #define HEVC_PCK_SW_SB12(in0, in1, in2, in3, in4, in5, in6, in7,   \
                         in8, in9, in10, in11, out0, out1, out2)   \
 {                                                                  \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m;          \
                                                                   \
    PCKEV_H4_SH(in0, in1, in2, in3, in4, in5, in6, in7,            \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
    PCKEV_H2_SH(in8, in9, in10, in11, tmp4_m, tmp5_m);             \
    PCKEV_B2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1);       \
    out2 = (v4i32) __msa_pckev_b((v16i8) tmp5_m, (v16i8) tmp4_m);  \
 }
 #define HEVC_FILT_8TAP(in0, in1, in2, in3,                       \
                       filt0, filt1, filt2, filt3)               \
 ( {                                                              \
@@ -48,4 +69,13 @@
    out_m;                                                       \
 } )
 /* Description : Two-stage signed dot product accumulated in word elements
    Arguments   : Inputs  - in0, in1      (vectors fed to the dot products)
                          - filt0, filt1  (coefficients, cast to v8i16)
                  Return Type - v4i32
    Details     : The signed dot product of 'in0' and 'filt0' seeds the
                  accumulator, the signed dot product of 'in1' and 'filt1'
                  is added on top, and the accumulator is the value of the
                  statement expression
 */
 #define HEVC_FILT_4TAP(in0, in1, filt0, filt1)            \
 ( {                                                       \
    v4i32 acc_m = __msa_dotp_s_w(in0, (v8i16) filt0);     \
                                                          \
    acc_m = __msa_dpadd_s_w(acc_m, in1, (v8i16) filt1);   \
    acc_m;                                                \
 } )
 #endif  /* AVCODEC_MIPS_HEVC_MACROS_MSA_H */
--- a/libavcodec/mips/hevc_mc_bi_msa.c
+++ b/libavcodec/mips/hevc_mc_bi_msa.c
--- a/libavcodec/mips/hevcdsp_init_mips.c
+++ b/libavcodec/mips/hevcdsp_init_mips.c
@@ -96,6 +96,74 @@ static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
        c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_msa;
        c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa;
        c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa;
        c->put_hevc_qpel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
        c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
        c->put_hevc_qpel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;
        c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa;
        c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa;
        c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa;
        c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_msa;
        c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_msa;
        c->put_hevc_qpel_bi[1][0][1] = ff_hevc_put_hevc_bi_qpel_h4_8_msa;
        c->put_hevc_qpel_bi[3][0][1] = ff_hevc_put_hevc_bi_qpel_h8_8_msa;
        c->put_hevc_qpel_bi[4][0][1] = ff_hevc_put_hevc_bi_qpel_h12_8_msa;
        c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_8_msa;
        c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_8_msa;
        c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_msa;
        c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_msa;
        c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_msa;
        c->put_hevc_qpel_bi[1][1][0] = ff_hevc_put_hevc_bi_qpel_v4_8_msa;
        c->put_hevc_qpel_bi[3][1][0] = ff_hevc_put_hevc_bi_qpel_v8_8_msa;
        c->put_hevc_qpel_bi[4][1][0] = ff_hevc_put_hevc_bi_qpel_v12_8_msa;
        c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_8_msa;
        c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_8_msa;
        c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_msa;
        c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_msa;
        c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_msa;
        c->put_hevc_qpel_bi[1][1][1] = ff_hevc_put_hevc_bi_qpel_hv4_8_msa;
        c->put_hevc_qpel_bi[3][1][1] = ff_hevc_put_hevc_bi_qpel_hv8_8_msa;
        c->put_hevc_qpel_bi[4][1][1] = ff_hevc_put_hevc_bi_qpel_hv12_8_msa;
        c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_8_msa;
        c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_8_msa;
        c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_8_msa;
        c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_8_msa;
        c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_8_msa;
        c->put_hevc_epel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
        c->put_hevc_epel_bi[2][0][0] = ff_hevc_put_hevc_bi_pel_pixels6_8_msa;
        c->put_hevc_epel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
        c->put_hevc_epel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;
        c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa;
        c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa;
        c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa;
        c->put_hevc_epel_bi[1][0][1] = ff_hevc_put_hevc_bi_epel_h4_8_msa;
        c->put_hevc_epel_bi[2][0][1] = ff_hevc_put_hevc_bi_epel_h6_8_msa;
        c->put_hevc_epel_bi[3][0][1] = ff_hevc_put_hevc_bi_epel_h8_8_msa;
        c->put_hevc_epel_bi[4][0][1] = ff_hevc_put_hevc_bi_epel_h12_8_msa;
        c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_8_msa;
        c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_8_msa;
        c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_msa;
        c->put_hevc_epel_bi[1][1][0] = ff_hevc_put_hevc_bi_epel_v4_8_msa;
        c->put_hevc_epel_bi[2][1][0] = ff_hevc_put_hevc_bi_epel_v6_8_msa;
        c->put_hevc_epel_bi[3][1][0] = ff_hevc_put_hevc_bi_epel_v8_8_msa;
        c->put_hevc_epel_bi[4][1][0] = ff_hevc_put_hevc_bi_epel_v12_8_msa;
        c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_8_msa;
        c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_8_msa;
        c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_msa;
        c->put_hevc_epel_bi[1][1][1] = ff_hevc_put_hevc_bi_epel_hv4_8_msa;
        c->put_hevc_epel_bi[2][1][1] = ff_hevc_put_hevc_bi_epel_hv6_8_msa;
        c->put_hevc_epel_bi[3][1][1] = ff_hevc_put_hevc_bi_epel_hv8_8_msa;
        c->put_hevc_epel_bi[4][1][1] = ff_hevc_put_hevc_bi_epel_hv12_8_msa;
        c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_8_msa;
        c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_msa;
        c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_msa;
    }
 }
 #endif  // #if HAVE_MSA
--- a/libavcodec/mips/hevcdsp_mips.h
+++ b/libavcodec/mips/hevcdsp_mips.h
@@ -116,3 +116,83 @@ UNI_MC(qpel, hv, 48);
 UNI_MC(qpel, hv, 64);
 #undef UNI_MC
 /* Declares one 8-bit MSA bi-prediction routine named
    ff_hevc_put_hevc_bi_<PEL>_<DIR><WIDTH>_8_msa.

    PEL, DIR and WIDTH are pasted into the identifier with single '##'
    operators; two adjacent '##' operators do not form a well-defined paste.

    All declared functions share one prototype:
    (dst, dst_stride, src, src_stride, src_16bit, height, mx, my, width).
    They are installed into the put_hevc_qpel_bi / put_hevc_epel_bi tables
    of HEVCDSPContext by hevcdsp_init_mips.c; the meaning of each parameter
    is defined by those function-pointer slots.
 */
 #define BI_MC(PEL, DIR, WIDTH)                                               \
 void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
                                                       ptrdiff_t dst_stride,  \
                                                       uint8_t *src,          \
                                                       ptrdiff_t src_stride,  \
                                                       int16_t *src_16bit,    \
                                                       int height,            \
                                                       intptr_t mx,           \
                                                       intptr_t my,           \
                                                       int width)
 BI_MC(pel, pixels, 4);
 BI_MC(pel, pixels, 6);
 BI_MC(pel, pixels, 8);
 BI_MC(pel, pixels, 12);
 BI_MC(pel, pixels, 16);
 BI_MC(pel, pixels, 24);
 BI_MC(pel, pixels, 32);
 BI_MC(pel, pixels, 48);
 BI_MC(pel, pixels, 64);
 BI_MC(qpel, h, 4);
 BI_MC(qpel, h, 8);
 BI_MC(qpel, h, 12);
 BI_MC(qpel, h, 16);
 BI_MC(qpel, h, 24);
 BI_MC(qpel, h, 32);
 BI_MC(qpel, h, 48);
 BI_MC(qpel, h, 64);
 BI_MC(qpel, v, 4);
 BI_MC(qpel, v, 8);
 BI_MC(qpel, v, 12);
 BI_MC(qpel, v, 16);
 BI_MC(qpel, v, 24);
 BI_MC(qpel, v, 32);
 BI_MC(qpel, v, 48);
 BI_MC(qpel, v, 64);
 BI_MC(qpel, hv, 4);
 BI_MC(qpel, hv, 8);
 BI_MC(qpel, hv, 12);
 BI_MC(qpel, hv, 16);
 BI_MC(qpel, hv, 24);
 BI_MC(qpel, hv, 32);
 BI_MC(qpel, hv, 48);
 BI_MC(qpel, hv, 64);
 BI_MC(epel, h, 4);
 BI_MC(epel, h, 6);
 BI_MC(epel, h, 8);
 BI_MC(epel, h, 12);
 BI_MC(epel, h, 16);
 BI_MC(epel, h, 24);
 BI_MC(epel, h, 32);
 BI_MC(epel, h, 48);
 BI_MC(epel, h, 64);
 BI_MC(epel, v, 4);
 BI_MC(epel, v, 6);
 BI_MC(epel, v, 8);
 BI_MC(epel, v, 12);
 BI_MC(epel, v, 16);
 BI_MC(epel, v, 24);
 BI_MC(epel, v, 32);
 BI_MC(epel, v, 48);
 BI_MC(epel, v, 64);
 BI_MC(epel, hv, 4);
 BI_MC(epel, hv, 6);
 BI_MC(epel, hv, 8);
 BI_MC(epel, hv, 12);
 BI_MC(epel, hv, 16);
 BI_MC(epel, hv, 24);
 BI_MC(epel, hv, 32);
 BI_MC(epel, hv, 48);
 BI_MC(epel, hv, 64);
 #undef BI_MC
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -29,18 +29,23 @@
 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
 /* Load one vector of type RTYPE by dereferencing 'psrc' cast to (RTYPE *) */
 #define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
 /* Same load as LD_H; the typed wrappers below fix RTYPE to a word vector */
 #define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
 #define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
 /* Store vector 'in' by assigning through 'pdst' cast to (RTYPE *) */
 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
 /* Same store as ST_B; the typed wrappers below fix RTYPE to a halfword
    vector */
 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
 #define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
 /* Same store as ST_B; the typed wrappers below fix RTYPE to a word vector */
 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
 #define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
 #if (__mips_isa_rev >= 6)
@@ -328,6 +333,46 @@
 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
 /* Description : Load vectors with 8 halfword elements with stride
    Arguments   : Inputs  - psrc    (source pointer to load from)
                          - stride  (added to 'psrc' before the cast to
                                     (RTYPE *), so it counts elements of the
                                     type 'psrc' points to)
                  Outputs - out0, out1
                  Return Type - as per RTYPE
    Details     : Loads 8 halfword elements in 'out0' from (psrc)
                  Loads 8 halfword elements in 'out1' from (psrc + stride)
 */
 #define LD_H2(RTYPE, psrc, stride, out0, out1)  \
 {                                               \
    out0 = LD_H(RTYPE, (psrc));                 \
    out1 = LD_H(RTYPE, (psrc) + (stride));      \
 }
 #define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
 /* Description : Load 4 vectors of halfword elements with stride
    Arguments   : Inputs  - psrc    (source pointer to load from)
                          - stride  (step between loads, in elements of the
                                     type 'psrc' points to)
                  Outputs - out0, out1, out2, out3
                  Return Type - as per RTYPE
    Details     : Loads 'out0' ... 'out3' from (psrc), (psrc + stride),
                  (psrc + 2 * stride) and (psrc + 3 * stride).
                  'stride' is parenthesized at every use so that an
                  expression argument such as 'a + b' is scaled as a whole
 */
 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)      \
 {                                                               \
    LD_H2(RTYPE, (psrc), (stride), out0, out1);                 \
    LD_H2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3);  \
 }
 #define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__)
 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
 /* Description : Load 6 vectors of halfword elements with stride
    Arguments   : Inputs  - psrc    (source pointer to load from)
                          - stride  (step between loads, in elements of the
                                     type 'psrc' points to)
                  Outputs - out0, out1, out2, out3, out4, out5
                  Return Type - as per RTYPE
    Details     : Loads 'out0' ... 'out3' with LD_H4 starting at (psrc) and
                  'out4', 'out5' with LD_H2 starting at (psrc + 4 * stride).
                  'stride' is parenthesized at every use, including when it
                  is forwarded, so that an expression argument such as
                  'a + b' is scaled as a whole
 */
 #define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
 {                                                                       \
    LD_H4(RTYPE, (psrc), (stride), out0, out1, out2, out3);             \
    LD_H2(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5);          \
 }
 #define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__)
 #define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__)
 /* Description : Load 8 vectors of halfword elements with stride
    Arguments   : Inputs  - psrc    (source pointer to load from)
                          - stride  (step between loads, in elements of the
                                     type 'psrc' points to)
                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                  Return Type - as per RTYPE
    Details     : Loads 'out0' ... 'out3' with LD_H4 starting at (psrc) and
                  'out4' ... 'out7' with LD_H4 starting at
                  (psrc + 4 * stride).
                  'stride' is parenthesized at every use, including when it
                  is forwarded, so that an expression argument such as
                  'a + b' is scaled as a whole
 */
 #define LD_H8(RTYPE, psrc, stride,                                          \
               out0, out1, out2, out3, out4, out5, out6, out7)               \
 {                                                                           \
    LD_H4(RTYPE, (psrc), (stride), out0, out1, out2, out3);                 \
    LD_H4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7);  \
 }
 #define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__)
 #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
 /* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst    (destination pointer to store to)
@@ -478,6 +523,55 @@
                                                                  \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
 }
 /* Description : Store as 4x8 byte block to destination memory from input
                  vectors
    Arguments   : Inputs  - in0, in1, pdst, stride
    Details     : ST4x4_UB stores the elements of 'in0' selected by indices
                  0, 1, 2, 3 on the first four lines starting at (pdst),
                  then the elements of 'in1' selected by the same indices
                  on the next four lines starting at (pdst + 4 * stride).
                  'pdst' is evaluated once; 'stride' is parenthesized so an
                  expression argument such as 'a + b' is scaled as a whole
 */
 #define ST4x8_UB(in0, in1, pdst, stride)                                  \
 {                                                                         \
    uint8_t *pblk_4x8_m = (uint8_t *) (pdst);                             \
                                                                          \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8_m, (stride));                 \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8_m + 4 * (stride), (stride));  \
 }
 /* Description : Store as 6x4 byte block to destination memory from input
                  vectors
    Arguments   : Inputs  - in0, in1, pdst, stride
                  Return Type - unsigned byte
    Details     : Index 0 word element from input vector 'in0' is copied and
                  stored on first line followed by index 2 halfword element
                  Index 2 word element from input vector 'in0' is copied and
                  stored on second line followed by index 6 halfword element
                  Index 0 word element from input vector 'in1' is copied and
                  stored on third line followed by index 2 halfword element
                  Index 2 word element from input vector 'in1' is copied and
                  stored on fourth line followed by index 6 halfword element
                  On every line the word goes to byte offset 0 and the
                  halfword to byte offset 4; the line pointer advances by
                  'stride' between lines
 */
 #define ST6x4_UB(in0, in1, pdst, stride)       \
 {                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;   \
    uint16_t out4_m, out5_m, out6_m, out7_m;   \
    uint8_t *pblk_6x4_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in0, 0);   \
    out1_m = __msa_copy_u_w((v4i32) in0, 2);   \
    out2_m = __msa_copy_u_w((v4i32) in1, 0);   \
    out3_m = __msa_copy_u_w((v4i32) in1, 2);   \
                                               \
    out4_m = __msa_copy_u_h((v8i16) in0, 2);   \
    out5_m = __msa_copy_u_h((v8i16) in0, 6);   \
    out6_m = __msa_copy_u_h((v8i16) in1, 2);   \
    out7_m = __msa_copy_u_h((v8i16) in1, 6);   \
                                               \
    SW(out0_m, pblk_6x4_m);                    \
    SH(out4_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out1_m, pblk_6x4_m);                    \
    SH(out5_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out2_m, pblk_6x4_m);                    \
    SH(out6_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out3_m, pblk_6x4_m);                    \
    SH(out7_m, (pblk_6x4_m + 4));              \
 }
 /* Description : Store as 8x2 byte block to destination memory from input vector
   Arguments   : Inputs  - in, pdst, stride
@@ -529,6 +623,15 @@
    ST8x4_UB(in0, in1, pblk_8x8_m, stride);               \
    ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
 }
 /* Description : Store as 12x4 byte block to destination memory from input
                  vectors
    Arguments   : Inputs  - in0, in1, in2, pdst, stride
    Details     : ST8x4_UB stores 'in0' and 'in1' as the left 8x4 part
                  starting at (pdst).
                  ST4x4_UB stores the elements of 'in2' selected by indices
                  0, 1, 2, 3 as the right 4x4 part starting at (pdst + 8).
                  'pdst' is evaluated once into a local byte pointer
 */
 #define ST12x4_UB(in0, in1, in2, pdst, stride)                \
 {                                                             \
    uint8_t *pblk_12x4_m = (uint8_t *) (pdst);                \
                                                              \
    /* left 8x4 */                                            \
    ST8x4_UB(in0, in1, pblk_12x4_m, stride);                  \
    /* right 4x4 */                                           \
    ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride);  \
 }
 /* Description : Store as 12x8 byte block to destination memory from
                 input vectors
@@ -1246,6 +1349,8 @@
 }
 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
 #define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
 /* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
@@ -1317,6 +1422,13 @@
 }
 #define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
 /* Six-vector variant of XORI_Bn_128: applies XORI_B4_128 to in0 ... in3
    and XORI_B2_128 to in4, in5.
    NOTE(review): going by the helper names each vector is updated in place
    by XOR-ing its byte elements with 128 - confirm against XORI_B2_128 */
 #define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
 {                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
 }
 #define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)
 #define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
 {                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
@@ -1442,6 +1554,25 @@
 }
 #define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
 #define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
 /* Description : Shift right arithmetic rounded (immediate)
    Arguments   : Inputs  - in0, in1, shift
                  Outputs - in0, in1 (in place)
                  Return Type - as per RTYPE
    Details     : Each halfword element of vector 'in0' is shifted right
                  arithmetic by value in 'shift'.
                  The last discarded bit is added to shifted value for rounding
                  and the result is in place written to 'in0'
                  Same for 'in1'
 */
 #define SRARI_H2(RTYPE, in0, in1, shift)              \
 {                                                     \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift);  \
 }
 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
 /* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1     (in place)
@@ -1499,6 +1630,25 @@
    ILVRL_B2_SH(zero_m, in, out0, out1);              \
 }
 /* Description : Sign extend halfword elements from input vector and return
                  result in pair of vectors
    Arguments   : Inputs  - in           (1 input halfword vector)
                  Outputs - out0, out1   (sign extended 2 word vectors)
                  Return Type - signed word
    Details     : A per-element "less than zero" mask of vector 'in' is
                  computed and interleaved right with 'in' to generate
                  4 signed word elements in 'out0'
                  The same mask is interleaved left with 'in' to
                  generate 4 signed word elements in 'out1'
 */
 #define UNPCK_SH_SW(in, out0, out1)                   \
 {                                                     \
    v8i16 sign_m = __msa_clti_s_h((v8i16) in, 0);     \
                                                      \
    ILVRL_H2_SW(sign_m, in, out0, out1);              \
 }
 /* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3      (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x4 byte block)