avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for hpel functions

This patch adds MSA (MIPS-SIMD-Arch) optimizations for hpel functions in the new file hpeldsp_msa.c. It also adds new generic macros (needed for this patch) to libavutil/mips/generic_macros_msa.h. Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
10 years ago · ee3ef5fda2
--- a/libavcodec/hpeldsp.c
+++ b/libavcodec/hpeldsp.c
@@ -365,4 +365,6 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
        ff_hpeldsp_init_ppc(c, flags);
    if (ARCH_X86)
        ff_hpeldsp_init_x86(c, flags);
    if (ARCH_MIPS)
        ff_hpeldsp_init_mips(c, flags);
 }
--- a/libavcodec/hpeldsp.h
+++ b/libavcodec/hpeldsp.h
@@ -99,5 +99,6 @@ void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_mips(HpelDSPContext *c, int flags);

 #endif /* AVCODEC_HPELDSP_H */
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -26,6 +26,7 @@ OBJS-$(CONFIG_H264CHROMA)                 += mips/h264chroma_init_mips.o
 OBJS-$(CONFIG_H264PRED)                   += mips/h264pred_init_mips.o
 OBJS-$(CONFIG_H263DSP)                    += mips/h263dsp_init_mips.o
 OBJS-$(CONFIG_QPELDSP)                    += mips/qpeldsp_init_mips.o
 OBJS-$(CONFIG_HPELDSP)                    += mips/hpeldsp_init_mips.o
 MSA-OBJS-$(CONFIG_HEVC_DECODER)           += mips/hevcdsp_msa.o            \
                                             mips/hevc_mc_uni_msa.o        \
                                             mips/hevc_mc_uniw_msa.o       \
@@ -41,5 +42,6 @@ MSA-OBJS-$(CONFIG_H264CHROMA)             += mips/h264chroma_msa.o
 MSA-OBJS-$(CONFIG_H264PRED)               += mips/h264pred_msa.o
 MSA-OBJS-$(CONFIG_H263DSP)                += mips/h263dsp_msa.o
 MSA-OBJS-$(CONFIG_QPELDSP)                += mips/qpeldsp_msa.o
 MSA-OBJS-$(CONFIG_HPELDSP)                += mips/hpeldsp_msa.o
 LOONGSON3-OBJS-$(CONFIG_H264DSP)          += mips/h264dsp_mmi.o
 LOONGSON3-OBJS-$(CONFIG_H264CHROMA)       += mips/h264chroma_mmi.o
--- a/libavcodec/mips/hpeldsp_init_mips.c
+++ b/libavcodec/mips/hpeldsp_init_mips.c
@@ -0,0 +1,73 @@
 /*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "../hpeldsp.h"
 #include "libavcodec/mips/hpeldsp_mips.h"

 #if HAVE_MSA
 /*
  * Point the half-pel function tables of 'c' at the MSA routines.
  *
  * Assignments are grouped by table row: row 0 receives the pixels16
  * routines, row 1 the pixels8 routines and row 2 the pixels4 routines.
  * Within a row the column index selects the plain, _x2, _y2 and _xy2
  * variant (columns 0..3).  'flags' is accepted but not consulted here.
  */
 static void ff_hpeldsp_init_msa(HpelDSPContext *c, int flags)
 {
    /* Row 0: pixels16 routines. */
    c->put_pixels_tab[0][0]        = ff_put_pixels16_msa;
    c->put_pixels_tab[0][1]        = ff_put_pixels16_x2_msa;
    c->put_pixels_tab[0][2]        = ff_put_pixels16_y2_msa;
    c->put_pixels_tab[0][3]        = ff_put_pixels16_xy2_msa;

    /* Column 0 of the no_rnd table reuses the ordinary put routine. */
    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_msa;
    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_msa;
    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_msa;
    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_msa;

    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_msa;
    c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_msa;
    c->avg_pixels_tab[0][2]        = ff_avg_pixels16_y2_msa;
    c->avg_pixels_tab[0][3]        = ff_avg_pixels16_xy2_msa;

    /* Row 1: pixels8 routines. */
    c->put_pixels_tab[1][0]        = ff_put_pixels8_msa;
    c->put_pixels_tab[1][1]        = ff_put_pixels8_x2_msa;
    c->put_pixels_tab[1][2]        = ff_put_pixels8_y2_msa;
    c->put_pixels_tab[1][3]        = ff_put_pixels8_xy2_msa;

    /* Column 0 of the no_rnd table reuses the ordinary put routine. */
    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_msa;
    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_msa;
    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_msa;
    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_msa;

    c->avg_pixels_tab[1][0]        = ff_avg_pixels8_msa;
    c->avg_pixels_tab[1][1]        = ff_avg_pixels8_x2_msa;
    c->avg_pixels_tab[1][2]        = ff_avg_pixels8_y2_msa;
    c->avg_pixels_tab[1][3]        = ff_avg_pixels8_xy2_msa;

    /* Row 2: pixels4 routines.  put_pixels_tab[2][0] is not assigned here
     * and therefore keeps whatever value it had on entry; the no_rnd table
     * gets no row-2 entries at all. */
    c->put_pixels_tab[2][1]        = ff_put_pixels4_x2_msa;
    c->put_pixels_tab[2][2]        = ff_put_pixels4_y2_msa;
    c->put_pixels_tab[2][3]        = ff_put_pixels4_xy2_msa;

    c->avg_pixels_tab[2][0]        = ff_avg_pixels4_msa;
    c->avg_pixels_tab[2][1]        = ff_avg_pixels4_x2_msa;
    c->avg_pixels_tab[2][2]        = ff_avg_pixels4_y2_msa;
    c->avg_pixels_tab[2][3]        = ff_avg_pixels4_xy2_msa;
 }
 #endif  // #if HAVE_MSA

 /**
  * MIPS entry point for half-pel DSP initialisation.
  *
  * When HAVE_MSA is non-zero at compile time, 'c' and 'flags' are forwarded
  * unchanged to ff_hpeldsp_init_msa(), which overwrites table entries in 'c'.
  * Otherwise the body is empty and 'c' is left as the caller prepared it.
  * The selection is purely a build-time one; no run-time CPU capability
  * check is performed in this function.
  *
  * @param c     context whose function tables may be overridden
  * @param flags passed through to ff_hpeldsp_init_msa(); not inspected here
  */
 void ff_hpeldsp_init_mips(HpelDSPContext *c, int flags)
 {
 #if HAVE_MSA
    ff_hpeldsp_init_msa(c, flags);
 #endif  // #if HAVE_MSA
 }
--- a/libavcodec/mips/hpeldsp_mips.h
+++ b/libavcodec/mips/hpeldsp_mips.h
@@ -0,0 +1,87 @@
 /*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #ifndef AVCODEC_MIPS_HPELDSP_MIPS_H
 #define AVCODEC_MIPS_HPELDSP_MIPS_H

 #include "libavcodec/bit_depth_template.c"

 /*
  * Prototypes of the MSA half-pel routines.
  *
  * Every routine has the same signature:
  *   block     - writable byte buffer
  *   pixels    - read-only byte buffer
  *   line_size - ptrdiff_t stride argument
  *   h         - int32_t count argument
  * NOTE(review): presumably 'block' is the destination, 'pixels' the source,
  * 'line_size' the row stride in bytes and 'h' the number of rows - confirm
  * against the op_pixels_func documentation in hpeldsp.h.
  *
  * Names follow <op>_pixels<W>[_x2|_y2|_xy2], with <op> one of put,
  * put_no_rnd or avg and <W> one of 16, 8 or 4.
  * NOTE(review): _x2 / _y2 / _xy2 presumably denote horizontal, vertical and
  * diagonal half-sample interpolation - confirm against hpeldsp.h.
  */

 /* put, W = 16 */
 void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int32_t h);
 void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int32_t h);
 void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int32_t h);
 void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int32_t h);
 /* put, W = 8 */
 void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int32_t h);
 void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int32_t h);
 void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int32_t h);
 void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int32_t h);
 /* put, W = 4.  ff_put_pixels4_msa is declared here, but
  * ff_hpeldsp_init_msa() in hpeldsp_init_mips.c does not install it in any
  * table (put_pixels_tab[2][0] is not assigned there). */
 void ff_put_pixels4_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int32_t h);
 void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int32_t h);
 void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int32_t h);
 void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int32_t h);
 /* put_no_rnd, W = 16 and W = 8.  No plain (column 0) and no W = 4 variants
  * are declared for this family. */
 void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int32_t h);
 void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int32_t h);
 void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int32_t h);
 void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                                  ptrdiff_t line_size, int32_t h);
 void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                                  ptrdiff_t line_size, int32_t h);
 void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int32_t h);
 /* avg, W = 16 */
 void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int32_t h);
 void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int32_t h);
 void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int32_t h);
 void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int32_t h);
 /* avg, W = 8 */
 void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int32_t h);
 void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int32_t h);
 void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int32_t h);
 void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int32_t h);
 /* avg, W = 4 */
 void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int32_t h);
 void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int32_t h);
 void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int32_t h);
 void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int32_t h);

 #endif  // #ifndef AVCODEC_MIPS_HPELDSP_MIPS_H
--- a/libavcodec/mips/hpeldsp_msa.c
+++ b/libavcodec/mips/hpeldsp_msa.c
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -804,6 +804,15 @@
 #define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
 #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

 /* Description : Three-vector variant of SLDI_B2_0
    Arguments   : Inputs  - in0, in1, in2, slide_val
                  Outputs - out0, out1, out2
                  Return Type - as per RTYPE
    Details     : 'in0' and 'in1' are handed to SLDI_B2_0 together with
                  'slide_val', which produces 'out0' and 'out1'
                  'in2' is passed to __msa_sldi_b with the zero-initialised
                  vector 'zero_m' as the other operand and the same
                  'slide_val'; the result is cast to RTYPE and written to
                  'out2'
                  SLDI_B3_0_UB and SLDI_B3_0_SB fix RTYPE to v16u8 and v16i8
 */
 #define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2,  slide_val)     \
 {                                                                         \
    v16i8 zero_m = { 0 };                                                 \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);                    \
    out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val);  \
 }
 #define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)
 #define SLDI_B3_0_SB(...) SLDI_B3_0(v16i8, __VA_ARGS__)

 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
                  out0, out1, out2, out3, slide_val)    \
 {                                                       \
@@ -1174,6 +1183,13 @@
 }
 #define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

 /* Description : Three-vector variant of HADD_UB2
    Arguments   : Inputs  - in0, in1, in2
                  Outputs - out0, out1, out2
                  Return Type - as per RTYPE
    Details     : 'in0' and 'in1' are handed to HADD_UB2, which produces
                  'out0' and 'out1'
                  'in2' is passed as both operands of __msa_hadd_u_h; the
                  result is cast to RTYPE and written to 'out2'
                  NOTE(review): presumably this adds each pair of adjacent
                  unsigned bytes of 'in2' into one halfword - confirm against
                  the MSA hadd_u.h definition
                  HADD_UB3_UH fixes RTYPE to v8u16
 */
 #define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
 {                                                             \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                    \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
 }
 #define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

 /* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
@@ -2408,6 +2424,67 @@
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
 }

 /* Description : Average byte elements from pair of vectors and store 8x4 byte
                  block in destination memory
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
                  Outputs - none (results are written to memory through 'pdst')
                  Return Type - none
    Details     : Each byte element from input vector pair 'in0' and 'in1' are
                  averaged (a + b)/2 with __msa_ave_u_b and stored in 'tmp0_m'
                  Each byte element from input vector pair 'in2' and 'in3' are
                  averaged (a + b)/2 and stored in 'tmp1_m'
                  Each byte element from input vector pair 'in4' and 'in5' are
                  averaged (a + b)/2 and stored in 'tmp2_m'
                  Each byte element from input vector pair 'in6' and 'in7' are
                  averaged (a + b)/2 and stored in 'tmp3_m'
                  Doubleword 0 of each of the 4 results is extracted with
                  __msa_copy_u_d into 'out0_m'..'out3_m', which are handed to
                  SD4 together with 'pdst' and 'stride', i.e. the half vector
                  results from all 4 vectors are stored in destination memory
                  as 8x4 byte block
                  All inputs are cast to v16u8 before use
 */
 #define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
 {                                                                           \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                       \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                       \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                       \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                       \
                                                                            \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0);                             \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                      \
 }

 /* Description : Average byte elements from pair of vectors and store 16x4 byte
                  block in destination memory
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
                  Outputs - none (results are written to memory through 'pdst')
                  Return Type - none
    Details     : Each byte element from input vector pair 'in0' and 'in1' are
                  averaged (a + b)/2 with __msa_ave_u_b and stored in 'tmp0_m'
                  Each byte element from input vector pair 'in2' and 'in3' are
                  averaged (a + b)/2 and stored in 'tmp1_m'
                  Each byte element from input vector pair 'in4' and 'in5' are
                  averaged (a + b)/2 and stored in 'tmp2_m'
                  Each byte element from input vector pair 'in6' and 'in7' are
                  averaged (a + b)/2 and stored in 'tmp3_m'
                  The 4 full result vectors are handed to ST_UB4 together with
                  'pdst' and 'stride', i.e. the results from all 4 vectors are
                  stored in destination memory as 16x4 byte block
                  All inputs are cast to v16u8 before use
 */
 #define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
 {                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
                                                                             \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                        \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                        \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                        \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                        \
                                                                             \
    ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride);                    \
 }

 /* Description : Average rounded byte elements from pair of vectors and store
                 8x4 byte block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
@@ -2439,6 +2516,91 @@
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                       \
 }

 /* Description : Average rounded byte elements from pair of vectors and store
                  16x4 byte block in destination memory
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
                  Outputs - none (results are written to memory through 'pdst')
                  Return Type - none
    Details     : AVER_UB4_UB processes the four input pairs in one call:
                  Each byte element from input vector pair 'in0' and 'in1' are
                  average rounded (a + b + 1)/2 and stored in 't0_m'
                  Each byte element from input vector pair 'in2' and 'in3' are
                  average rounded (a + b + 1)/2 and stored in 't1_m'
                  Each byte element from input vector pair 'in4' and 'in5' are
                  average rounded (a + b + 1)/2 and stored in 't2_m'
                  Each byte element from input vector pair 'in6' and 'in7' are
                  average rounded (a + b + 1)/2 and stored in 't3_m'
                  The 4 full result vectors are handed to ST_UB4 together with
                  'pdst' and 'stride', i.e. the vector results from all 4
                  vectors are stored in destination memory as 16x4 byte block
                  The locals are named 't*_m'; AVER_DST_ST16x4_UB passes its
                  own 'tmp*_m' and 'dst*_m' locals in as arguments, so these
                  names must stay distinct from those
 */
 #define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
 {                                                                             \
    v16u8 t0_m, t1_m, t2_m, t3_m;                                             \
                                                                              \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                       \
                t0_m, t1_m, t2_m, t3_m);                                      \
    ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride);                             \
 }

 /* Description : Average rounded byte elements from pair of vectors,
                  average rounded with destination and store 8x4 byte block
                  in destination memory
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
                  Outputs - none (results are written to memory through 'pdst')
                  Return Type - none
    Details     : Four vectors 'dst0_m'..'dst3_m' are first loaded from 'pdst'
                  with LD_UB4 using 'stride'
                  Each byte element from input vector pair 'in0' and 'in1' are
                  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
                  Each byte element from input vector pair 'in2' and 'in3' are
                  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
                  Each byte element from input vector pair 'in4' and 'in5' are
                  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
                  Each byte element from input vector pair 'in6' and 'in7' are
                  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
                  The pairs ('dst0_m', 'tmp0_m') .. ('dst3_m', 'tmp3_m') are
                  then handed to AVER_ST8x4_UB, which average rounds each pair
                  and stores the half vector results in destination memory as
                  8x4 byte block
                  NOTE(review): LD_UB4 presumably reads a full vector per row
                  although only 8 bytes per row are written back - confirm
                  that the wider read from 'pdst' is acceptable to callers
                  NOTE(review): 'tmp*_m' and 'dst*_m' are passed into
                  AVER_ST8x4_UB as arguments, so they must not clash with that
                  macro's own local names - verify before renaming anything
 */
 #define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                           pdst, stride)                            \
 {                                                                  \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
                                                                   \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);          \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
    AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
 }

 /* Description : Average rounded byte elements from pair of vectors,
                  average rounded with destination and store 16x4 byte block
                  in destination memory
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
                  Outputs - none (results are written to memory through 'pdst')
                  Return Type - none
    Details     : Four vectors 'dst0_m'..'dst3_m' are first loaded from 'pdst'
                  with LD_UB4 using 'stride'
                  Each byte element from input vector pair 'in0' and 'in1' are
                  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
                  Each byte element from input vector pair 'in2' and 'in3' are
                  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
                  Each byte element from input vector pair 'in4' and 'in5' are
                  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
                  Each byte element from input vector pair 'in6' and 'in7' are
                  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
                  The pairs ('dst0_m', 'tmp0_m') .. ('dst3_m', 'tmp3_m') are
                  then handed to AVER_ST16x4_UB, which average rounds each
                  pair and stores the vector results in destination memory as
                  16x4 byte block
                  'tmp*_m' and 'dst*_m' are passed into AVER_ST16x4_UB as
                  arguments; that macro names its own locals 't*_m', and the
                  two sets of names must stay distinct
 */
 #define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                            pdst, stride)                            \
 {                                                                   \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
                                                                    \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
    AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
 }

 /* Description : Add block 4x4
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
                 Outputs -