|
|
@@ -23,6 +23,11 @@ |
|
|
|
|
|
|
|
#include <stdint.h> |
|
|
|
#include <msa.h> |
|
|
|
#include <config.h> |
|
|
|
|
|
|
|
#if HAVE_MSA2 |
|
|
|
#include <msa2.h> |
|
|
|
#endif |
|
|
|
|
|
|
|
/* Base alignment value used by this header.
   NOTE(review): presumably a byte count matching one 128-bit vector
   register - confirm against the users of ALIGNMENT. */
#define ALIGNMENT 16 |
|
|
|
/* Expands to a GCC-style 'aligned' attribute. The requested alignment is
   twice the value passed in, because the argument is shifted left by one. */
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1))) |
|
|
@@ -1234,6 +1239,15 @@ |
|
|
|
unsigned absolute diff values, even-odd pairs are added |
|
|
|
together to generate 8 halfword results. |
|
|
|
*/ |
|
|
|
#if HAVE_MSA2 |
|
|
|
/* MSA2 variant of SAD_UB2_UH.
   Arguments : Inputs - in0, in1 (source vectors), ref0, ref1 (reference
                        vectors); each is reinterpreted as v16u8
               Return - v8u16 accumulator (value of the statement expression)
   Details   : The MSA2 builtin is applied to the pairs (in0, ref0) and
               (in1, ref1) and both results are summed into a
               zero-initialised accumulator.
               Every macro argument is parenthesised so that the vector cast
               applies to the whole argument expression, not only to its
               first operand.
   NOTE(review): the lane layout produced by
   __builtin_msa2_sad_adj2_u_w2x_b is not visible from this file - confirm
   against the MSA2 toolchain documentation. */
#define SAD_UB2_UH(in0, in1, ref0, ref1)                                      \
  ( {                                                                         \
    v8u16 sad_m = { 0 };                                                      \
                                                                              \
    sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) (in0), (v16u8) (ref0)); \
    sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) (in1), (v16u8) (ref1)); \
                                                                              \
    sad_m;                                                                    \
  } )
|
|
|
#else |
|
|
|
#define SAD_UB2_UH(in0, in1, ref0, ref1) \ |
|
|
|
( { \ |
|
|
|
v16u8 diff0_m, diff1_m; \ |
|
|
@@ -1247,6 +1261,7 @@ |
|
|
|
\ |
|
|
|
sad_m; \ |
|
|
|
} ) |
|
|
|
#endif // #if HAVE_MSA2 |
|
|
|
|
|
|
|
/* Description : Insert specified word elements from input vectors to 1 |
|
|
|
destination vector |
|
|
@@ -2287,6 +2302,12 @@ |
|
|
|
extracted and interleaved with same vector 'in0' to generate |
|
|
|
4 word elements keeping sign intact |
|
|
|
*/ |
|
|
|
#if HAVE_MSA2 |
|
|
|
/* MSA2 variant of UNPCK_R_SH_SW.
   Arguments : Input  - in  (reinterpreted as v8i16)
               Output - out (assigned a v4i32 value)
   Details   : A single MSA2 widening builtin replaces the compare-and-
               interleave sequence used by the non-MSA2 variant.
               'in' is parenthesised so that the cast covers the whole
               argument expression.
   NOTE(review): __builtin_msa2_w2x_lo_s_h is assumed to sign-extend the
   same (right/low) four halfwords that the non-MSA2 variant interleaves -
   confirm against the MSA2 toolchain documentation. */
#define UNPCK_R_SH_SW(in, out)                              \
  {                                                         \
    out = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) (in)); \
  }
|
|
|
#else |
|
|
|
#define UNPCK_R_SH_SW(in, out) \ |
|
|
|
{ \ |
|
|
|
v8i16 sign_m; \ |
|
|
@@ -2294,6 +2315,7 @@ |
|
|
|
sign_m = __msa_clti_s_h((v8i16) in, 0); \ |
|
|
|
out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in); \ |
|
|
|
} |
|
|
|
#endif // #if HAVE_MSA2 |
|
|
|
|
|
|
|
/* Description : Sign extend byte elements from input vector and return |
|
|
|
halfword results in pair of vectors |
|
|
@@ -2306,6 +2328,13 @@ |
|
|
|
Then interleaved left with same vector 'in0' to |
|
|
|
generate 8 signed halfword elements in 'out1' |
|
|
|
*/ |
|
|
|
#if HAVE_MSA2 |
|
|
|
/* MSA2 variant of UNPCK_SB_SH.
   Arguments : Input   - in (reinterpreted as v16i8)
               Outputs - out0, out1 (each assigned a v8i16 value)
   Details   : The 'lo' builtin result goes to out0 and the 'hi' builtin
               result goes to out1.
               The results are cast to v8i16 (halfword vectors), matching
               the _SH suffix and the ILVRL_B2_SH based non-MSA2 variant; a
               v4i32 cast would not be assignable to v8i16 outputs without
               lax vector conversions.
               'in' is parenthesised so that the cast covers the whole
               argument expression.
   NOTE(review): the lo/hi builtins are assumed to sign-extend the right and
   left eight bytes respectively - confirm against the MSA2 toolchain
   documentation. */
#define UNPCK_SB_SH(in, out0, out1)                          \
  {                                                          \
    out0 = (v8i16) __builtin_msa2_w2x_lo_s_b((v16i8) (in)); \
    out1 = (v8i16) __builtin_msa2_w2x_hi_s_b((v16i8) (in)); \
  }
|
|
|
#else |
|
|
|
#define UNPCK_SB_SH(in, out0, out1) \ |
|
|
|
{ \ |
|
|
|
v16i8 tmp_m; \ |
|
|
@@ -2313,6 +2342,7 @@ |
|
|
|
tmp_m = __msa_clti_s_b((v16i8) in, 0); \ |
|
|
|
ILVRL_B2_SH(tmp_m, in, out0, out1); \ |
|
|
|
} |
|
|
|
#endif // #if HAVE_MSA2 |
|
|
|
|
|
|
|
/* Description : Zero extend unsigned byte elements to halfword elements |
|
|
|
Arguments : Inputs - in (1 input unsigned byte vector) |
|
|
@@ -2339,6 +2369,13 @@ |
|
|
|
Then interleaved left with same vector 'in0' to |
|
|
|
generate 4 signed word elements in 'out1' |
|
|
|
*/ |
|
|
|
#if HAVE_MSA2 |
|
|
|
/* MSA2 variant of UNPCK_SH_SW.
   Arguments : Input   - in (reinterpreted as v8i16)
               Outputs - out0, out1 (each assigned a v4i32 value)
   Details   : The 'lo' builtin result goes to out0 and the 'hi' builtin
               result goes to out1.
               'in' is parenthesised so that the cast covers the whole
               argument expression.
   NOTE(review): the lo/hi builtins are assumed to sign-extend the right and
   left four halfwords respectively - confirm against the MSA2 toolchain
   documentation. */
#define UNPCK_SH_SW(in, out0, out1)                          \
  {                                                          \
    out0 = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) (in)); \
    out1 = (v4i32) __builtin_msa2_w2x_hi_s_h((v8i16) (in)); \
  }
|
|
|
#else |
|
|
|
#define UNPCK_SH_SW(in, out0, out1) \ |
|
|
|
{ \ |
|
|
|
v8i16 tmp_m; \ |
|
|
@@ -2346,6 +2383,7 @@ |
|
|
|
tmp_m = __msa_clti_s_h((v8i16) in, 0); \ |
|
|
|
ILVRL_H2_SW(tmp_m, in, out0, out1); \ |
|
|
|
} |
|
|
|
#endif // #if HAVE_MSA2 |
|
|
|
|
|
|
|
/* Description : Swap two variables |
|
|
|
Arguments : Inputs - in0, in1 |
|
|
@@ -2850,13 +2888,11 @@ |
|
|
|
*/ |
|
|
|
/* Dot product of three input vectors with three coefficient vectors,
   summed into one halfword vector.
   Arguments : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
                        (each reinterpreted as v16i8)
               Return - v8i16 (value of the statement expression)
   Details   : out0_m = in0 . coeff0
               out0_m += in1 . coeff1
               out0_m += in2 . coeff2
               Each product is accumulated exactly once. The previous body
               added in2 . coeff2 twice: once through a temporary combined
               with __msa_adds_s_h and once more through __msa_dpadd_s_h.
               Macro arguments are parenthesised so that the casts cover the
               whole argument expressions.
   NOTE(review): the final accumulation uses __msa_dpadd_s_h, which is
   assumed not to saturate (unlike __msa_adds_s_h) - confirm that no caller
   relies on a saturating final add. */
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)             \
  ( {                                                                   \
    v8i16 out0_m;                                                       \
                                                                        \
    out0_m = __msa_dotp_s_h((v16i8) (in0), (v16i8) (coeff0));           \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) (in1), (v16i8) (coeff1)); \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) (in2), (v16i8) (coeff2)); \
                                                                        \
    out0_m;                                                             \
  } )
|
|
|