avcodec/mips: Split uni mc optimizations to new file

This patch moves HEVC code of uni mc cases to new file hevc_mc_uni_msa.c. (There are total 5 sub-modules of HEVC mc functions, if we add all these modules in one single file, its size would be huge (~750k) & difficult to maintain, so splitting it in multiple files) This patch also adds new HEVC header file libavcodec/mips/hevc_macros_msa.h Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
10 years ago · 10b77fbf0d
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -20,6 +20,7 @@ MIPSDSPR1-OBJS-$(CONFIG_AAC_ENCODER)      += mips/aaccoder_mips.o
 MIPSFPU-OBJS-$(CONFIG_AAC_ENCODER)        += mips/iirfilter_mips.o
 OBJS-$(CONFIG_HEVC_DECODER)               += mips/hevcdsp_init_mips.o
 OBJS-$(CONFIG_H264DSP)                    += mips/h264dsp_init_mips.o
 MSA-OBJS-$(CONFIG_HEVC_DECODER)           += mips/hevcdsp_msa.o
 MSA-OBJS-$(CONFIG_HEVC_DECODER)           += mips/hevcdsp_msa.o            \
                                             mips/hevc_mc_uni_msa.o
 MSA-OBJS-$(CONFIG_H264DSP)                += mips/h264dsp_msa.o
 LOONGSON3-OBJS-$(CONFIG_H264DSP)          += mips/h264dsp_mmi.o
--- a/libavcodec/mips/hevc_macros_msa.h
+++ b/libavcodec/mips/hevc_macros_msa.h
@@ -0,0 +1,51 @@
 /*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #ifndef AVCODEC_MIPS_HEVC_MACROS_MSA_H
 #define AVCODEC_MIPS_HEVC_MACROS_MSA_H

 #define HEVC_PCK_SW_SB2(in0, in1, out)                            \
 {                                                                 \
    v8i16 tmp0_m;                                                 \
                                                                  \
    tmp0_m = __msa_pckev_h((v8i16) in0, (v8i16) in1);             \
    out = (v4i32) __msa_pckev_b((v16i8) tmp0_m, (v16i8) tmp0_m);  \
 }

 #define HEVC_PCK_SW_SB4(in0, in1, in2, in3, out)                  \
 {                                                                 \
    v8i16 tmp0_m, tmp1_m;                                         \
                                                                  \
    PCKEV_H2_SH(in0, in1, in2, in3, tmp0_m, tmp1_m);              \
    out = (v4i32) __msa_pckev_b((v16i8) tmp1_m, (v16i8) tmp0_m);  \
 }

 #define HEVC_FILT_8TAP(in0, in1, in2, in3,                       \
                       filt0, filt1, filt2, filt3)               \
 ( {                                                              \
    v4i32 out_m;                                                 \
                                                                 \
    out_m = __msa_dotp_s_w((v8i16) in0, (v8i16) filt0);          \
    out_m = __msa_dpadd_s_w(out_m, (v8i16) in1, (v8i16) filt1);  \
    DPADD_SH2_SW(in2, in3, filt2, filt3, out_m, out_m);          \
    out_m;                                                       \
 } )

 #endif  /* AVCODEC_MIPS_HEVC_MACROS_MSA_H */
--- a/libavcodec/mips/hevc_mc_uni_msa.c
+++ b/libavcodec/mips/hevc_mc_uni_msa.c
--- a/libavcodec/mips/hevcdsp_msa.c
+++ b/libavcodec/mips/hevcdsp_msa.c
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -278,6 +278,7 @@
    out0 = LD_B(RTYPE, (psrc));                 \
    out1 = LD_B(RTYPE, (psrc) + stride);        \
 }
 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2)  \
@@ -349,6 +350,14 @@
 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
 #define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
              pdst, stride)                                         \
 {                                                                   \
    ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                 \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
 }
 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)

 /* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst    (destination pointer to store to)
@@ -425,6 +434,26 @@
    SH(out3_m, pblk_2x4_m + 3 * stride);               \
 }

 /* Description : Store as 4x2 byte block to destination memory from input vector
   Arguments   : Inputs  - in, pdst, stride
                 Return Type - unsigned byte
   Details     : Index 0 word element from input vector is copied and stored
                 on first line
                 Index 1 word element from input vector is copied and stored
                 on second line
 */
 #define ST4x2_UB(in, pdst, stride)             \
 {                                              \
    uint32_t out0_m, out1_m;                   \
    uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in, 0);    \
    out1_m = __msa_copy_u_w((v4i32) in, 1);    \
                                               \
    SW(out0_m, pblk_4x2_m);                    \
    SW(out1_m, pblk_4x2_m + stride);           \
 }

 /* Description : Store as 4x4 byte block to destination memory from input vector
   Arguments   : Inputs  - in0, in1, pdst, stride
                 Return Type - unsigned byte
@@ -598,7 +627,18 @@
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
 }
 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
 #define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

 #define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
 {                                                                          \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
 }
 #define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
                out0, out1, out2, out3)                            \
@@ -608,6 +648,57 @@
 }
 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)

 /* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective byte elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective byte elements from in2 & in3 are copied to out1 as
                 per control vector mask1
 */
 #define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)      \
 {                                                                         \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
 }
 #define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)

 /* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - signed halfword
   Details     : Signed byte elements from mult0 are multiplied with
                 signed byte elements from cnst0 producing a result
                 twice the size of input i.e. signed halfword.
                 Then this multiplication results of adjacent odd-even elements
                 are added together and stored to the out vector
                 (2 signed halfword results)
 */
 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
 {                                                                 \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
 }
 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

 #define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
 {                                                                  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
 }
 #define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
 {                                                                     \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
 }
 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

 /* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
@@ -701,6 +792,22 @@
    CLIP_SH2_0_255(in2, in3);               \
 }

 /* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in       (input vector)
                 Outputs - out_m    (output vector with clipped elements)
                 Return Type - signed word
 */
 #define CLIP_SW_0_255(in)                                 \
 ( {                                                       \
    v4i32 max_m = __msa_ldi_w(255);                       \
    v4i32 out_m;                                          \
                                                          \
    out_m = __msa_maxi_s_w((v4i32) in, 0);                \
    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
    out_m;                                                \
 } )

 /* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
@@ -1021,6 +1128,37 @@
 }
 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)

 /* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - unsigned halfword
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are in placed to original vectors
 */
 #define SAT_SH2(RTYPE, in0, in1, sat_val)               \
 {                                                       \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
 }
 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

 #define SAT_SH3(RTYPE, in0, in1, in2, sat_val)          \
 {                                                       \
    SAT_SH2(RTYPE, in0, in1, sat_val)                   \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);  \
 }
 #define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
 {                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
 }
 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)

 /* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
@@ -1043,6 +1181,7 @@
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
 }
 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)

 /* Description : Indexed word element values are replicated to all
@@ -1097,6 +1236,7 @@
    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);              \
 }
 #define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
 #define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
@@ -1123,6 +1263,7 @@
    out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3);  \
 }
 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
@@ -1131,6 +1272,7 @@
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
 }
 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
 #define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)

 /* Description : Each byte element is logically xor'ed with immediate 128
@@ -1212,6 +1354,7 @@
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
 }
 #define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)

 /* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
@@ -1266,6 +1409,64 @@
 }
 #define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)

 /* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - unsigned halfword
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
 */
 #define SRAR_H2(RTYPE, in0, in1, shift)                      \
 {                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
 }
 #define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
 #define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

 #define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
 {                                                            \
    SRAR_H2(RTYPE, in0, in1, shift)                          \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
 }
 #define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

 #define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
 {                                                  \
    SRAR_H2(RTYPE, in0, in1, shift)                \
    SRAR_H2(RTYPE, in2, in3, shift)                \
 }
 #define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
 #define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
 /* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1     (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for other pairs
 */
 #define SRARI_W2(RTYPE, in0, in1, shift)              \
 {                                                     \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
 }
 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
 {                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
 }
 #define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)

 /* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
@@ -1392,6 +1593,22 @@
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
 }

 /* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs  - in0, in1
                 Outputs - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulted vector is xor'ed with
                 128 to shift the range from signed to unsigned byte
 */
 #define PCKEV_XORI128_UB(in0, in1)                            \
 ( {                                                           \
    v16u8 out_m;                                              \
    out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);         \
    out_m;                                                    \
 } )

 /* Description : Pack even byte elements, extract 0 & 2 index words from pair
                 of results and store 4 words in destination memory as per
                 stride