|
|
|
@@ -25,187 +25,201 @@ static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride, |
|
|
|
int32_t log2_denom, int32_t src_weight, |
|
|
|
int32_t offset_in) |
|
|
|
{ |
|
|
|
uint32_t data0, data1; |
|
|
|
uint32_t tp0, tp1, offset_val; |
|
|
|
v16u8 zero = { 0 }; |
|
|
|
v16u8 src0, src1; |
|
|
|
v4i32 res0, res1; |
|
|
|
v8i16 temp0, temp1, vec0, vec1, wgt, denom, offset; |
|
|
|
v8u16 out0, out1; |
|
|
|
v16u8 src0 = { 0 }; |
|
|
|
v8i16 src0_r, tmp0, wgt, denom, offset; |
|
|
|
|
|
|
|
offset_in <<= (log2_denom); |
|
|
|
|
|
|
|
if (log2_denom) { |
|
|
|
offset_in += (1 << (log2_denom - 1)); |
|
|
|
} |
|
|
|
offset_val = (unsigned) offset_in << log2_denom; |
|
|
|
|
|
|
|
wgt = __msa_fill_h(src_weight); |
|
|
|
offset = __msa_fill_h(offset_in); |
|
|
|
offset = __msa_fill_h(offset_val); |
|
|
|
denom = __msa_fill_h(log2_denom); |
|
|
|
|
|
|
|
data0 = LW(data); |
|
|
|
data1 = LW(data + stride); |
|
|
|
|
|
|
|
src0 = (v16u8) __msa_fill_w(data0); |
|
|
|
src1 = (v16u8) __msa_fill_w(data1); |
|
|
|
LW2(data, stride, tp0, tp1); |
|
|
|
INSERT_W2_UB(tp0, tp1, src0); |
|
|
|
src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0); |
|
|
|
tmp0 = wgt * src0_r; |
|
|
|
tmp0 = __msa_adds_s_h(tmp0, offset); |
|
|
|
tmp0 = __msa_maxi_s_h(tmp0, 0); |
|
|
|
tmp0 = __msa_srlr_h(tmp0, denom); |
|
|
|
tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7); |
|
|
|
src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0); |
|
|
|
ST4x2_UB(src0, data, stride); |
|
|
|
} |
|
|
|
|
|
|
|
ILVR_B2_SH(zero, src0, zero, src1, vec0, vec1); |
|
|
|
MUL2(wgt, vec0, wgt, vec1, temp0, temp1); |
|
|
|
ADDS_SH2_SH(temp0, offset, temp1, offset, temp0, temp1); |
|
|
|
MAXI_SH2_SH(temp0, temp1, 0); |
|
|
|
static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom, |
|
|
|
int32_t src_weight, int32_t offset_in) |
|
|
|
{ |
|
|
|
uint32_t tp0, tp1, tp2, tp3, offset_val; |
|
|
|
v16u8 src0 = { 0 }; |
|
|
|
v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset; |
|
|
|
|
|
|
|
out0 = (v8u16) __msa_srl_h(temp0, denom); |
|
|
|
out1 = (v8u16) __msa_srl_h(temp1, denom); |
|
|
|
offset_val = (unsigned) offset_in << log2_denom; |
|
|
|
|
|
|
|
SAT_UH2_UH(out0, out1, 7); |
|
|
|
PCKEV_B2_SW(out0, out0, out1, out1, res0, res1); |
|
|
|
wgt = __msa_fill_h(src_weight); |
|
|
|
offset = __msa_fill_h(offset_val); |
|
|
|
denom = __msa_fill_h(log2_denom); |
|
|
|
|
|
|
|
data0 = __msa_copy_u_w(res0, 0); |
|
|
|
data1 = __msa_copy_u_w(res1, 0); |
|
|
|
SW(data0, data); |
|
|
|
data += stride; |
|
|
|
SW(data1, data); |
|
|
|
LW4(data, stride, tp0, tp1, tp2, tp3); |
|
|
|
INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); |
|
|
|
UNPCK_UB_SH(src0, src0_r, src1_r); |
|
|
|
MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1); |
|
|
|
ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1); |
|
|
|
MAXI_SH2_SH(tmp0, tmp1, 0); |
|
|
|
tmp0 = __msa_srlr_h(tmp0, denom); |
|
|
|
tmp1 = __msa_srlr_h(tmp1, denom); |
|
|
|
SAT_UH2_SH(tmp0, tmp1, 7); |
|
|
|
src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); |
|
|
|
ST4x4_UB(src0, src0, 0, 1, 2, 3, data, stride); |
|
|
|
} |
|
|
|
|
|
|
|
static void avc_wgt_4x4multiple_msa(uint8_t *data, int32_t stride, |
|
|
|
int32_t height, int32_t log2_denom, |
|
|
|
int32_t src_weight, int32_t offset_in) |
|
|
|
static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom, |
|
|
|
int32_t src_weight, int32_t offset_in) |
|
|
|
{ |
|
|
|
uint8_t cnt; |
|
|
|
uint32_t data0, data1, data2, data3; |
|
|
|
v16u8 zero = { 0 }; |
|
|
|
v16u8 src0, src1, src2, src3; |
|
|
|
v8u16 temp0, temp1, temp2, temp3, wgt; |
|
|
|
v8i16 denom, offset; |
|
|
|
uint32_t tp0, tp1, tp2, tp3, offset_val; |
|
|
|
v16u8 src0 = { 0 }, src1 = { 0 }; |
|
|
|
v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3; |
|
|
|
v8i16 wgt, denom, offset; |
|
|
|
|
|
|
|
offset_in <<= (log2_denom); |
|
|
|
offset_val = (unsigned) offset_in << log2_denom; |
|
|
|
|
|
|
|
if (log2_denom) { |
|
|
|
offset_in += (1 << (log2_denom - 1)); |
|
|
|
} |
|
|
|
|
|
|
|
wgt = (v8u16) __msa_fill_h(src_weight); |
|
|
|
offset = __msa_fill_h(offset_in); |
|
|
|
wgt = __msa_fill_h(src_weight); |
|
|
|
offset = __msa_fill_h(offset_val); |
|
|
|
denom = __msa_fill_h(log2_denom); |
|
|
|
|
|
|
|
for (cnt = height / 4; cnt--;) { |
|
|
|
LW4(data, stride, data0, data1, data2, data3); |
|
|
|
|
|
|
|
src0 = (v16u8) __msa_fill_w(data0); |
|
|
|
src1 = (v16u8) __msa_fill_w(data1); |
|
|
|
src2 = (v16u8) __msa_fill_w(data2); |
|
|
|
src3 = (v16u8) __msa_fill_w(data3); |
|
|
|
|
|
|
|
ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3, |
|
|
|
temp0, temp1, temp2, temp3); |
|
|
|
MUL4(wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3, |
|
|
|
temp0, temp1, temp2, temp3); |
|
|
|
ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset, |
|
|
|
temp0, temp1, temp2, temp3); |
|
|
|
MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0); |
|
|
|
SRL_H4_UH(temp0, temp1, temp2, temp3, denom); |
|
|
|
SAT_UH4_UH(temp0, temp1, temp2, temp3, 7); |
|
|
|
PCKEV_ST4x4_UB(temp0, temp1, temp2, temp3, data, stride); |
|
|
|
data += (4 * stride); |
|
|
|
} |
|
|
|
LW4(data, stride, tp0, tp1, tp2, tp3); |
|
|
|
INSERT_W4_UB(tp0, tp1, tp2, tp3, src0); |
|
|
|
LW4(data + 4 * stride, stride, tp0, tp1, tp2, tp3); |
|
|
|
INSERT_W4_UB(tp0, tp1, tp2, tp3, src1); |
|
|
|
UNPCK_UB_SH(src0, src0_r, src1_r); |
|
|
|
UNPCK_UB_SH(src1, src2_r, src3_r); |
|
|
|
MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2, |
|
|
|
tmp3); |
|
|
|
ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0, |
|
|
|
tmp1, tmp2, tmp3); |
|
|
|
MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0); |
|
|
|
SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom); |
|
|
|
SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7); |
|
|
|
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); |
|
|
|
ST4x8_UB(src0, src1, data, stride); |
|
|
|
} |
|
|
|
|
|
|
|
static void avc_wgt_4width_msa(uint8_t *data, int32_t stride, |
|
|
|
int32_t height, int32_t log2_denom, |
|
|
|
int32_t src_weight, int32_t offset_in) |
|
|
|
static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom, |
|
|
|
int32_t src_weight, int32_t offset_in) |
|
|
|
{ |
|
|
|
if (2 == height) { |
|
|
|
avc_wgt_4x2_msa(data, stride, log2_denom, src_weight, offset_in); |
|
|
|
} else { |
|
|
|
avc_wgt_4x4multiple_msa(data, stride, height, log2_denom, src_weight, |
|
|
|
offset_in); |
|
|
|
} |
|
|
|
uint32_t offset_val; |
|
|
|
uint64_t tp0, tp1, tp2, tp3; |
|
|
|
v16u8 src0 = { 0 }, src1 = { 0 }; |
|
|
|
v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3; |
|
|
|
v8i16 wgt, denom, offset; |
|
|
|
|
|
|
|
offset_val = (unsigned) offset_in << log2_denom; |
|
|
|
|
|
|
|
wgt = __msa_fill_h(src_weight); |
|
|
|
offset = __msa_fill_h(offset_val); |
|
|
|
denom = __msa_fill_h(log2_denom); |
|
|
|
|
|
|
|
LD4(data, stride, tp0, tp1, tp2, tp3); |
|
|
|
INSERT_D2_UB(tp0, tp1, src0); |
|
|
|
INSERT_D2_UB(tp2, tp3, src1); |
|
|
|
UNPCK_UB_SH(src0, src0_r, src1_r); |
|
|
|
UNPCK_UB_SH(src1, src2_r, src3_r); |
|
|
|
MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2, |
|
|
|
tmp3); |
|
|
|
ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0, |
|
|
|
tmp1, tmp2, tmp3); |
|
|
|
MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0); |
|
|
|
SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom); |
|
|
|
SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7); |
|
|
|
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); |
|
|
|
ST8x4_UB(src0, src1, data, stride); |
|
|
|
} |
|
|
|
|
|
|
|
static void avc_wgt_8width_msa(uint8_t *data, int32_t stride, |
|
|
|
int32_t height, int32_t log2_denom, |
|
|
|
int32_t src_weight, int32_t offset_in) |
|
|
|
static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom, |
|
|
|
int32_t src_weight, int32_t offset_in) |
|
|
|
{ |
|
|
|
uint8_t cnt; |
|
|
|
v16u8 zero = { 0 }; |
|
|
|
v16u8 src0, src1, src2, src3; |
|
|
|
v8u16 src0_r, src1_r, src2_r, src3_r; |
|
|
|
v8u16 temp0, temp1, temp2, temp3; |
|
|
|
v8u16 wgt, denom, offset; |
|
|
|
v16i8 out0, out1; |
|
|
|
uint32_t offset_val; |
|
|
|
uint64_t tp0, tp1, tp2, tp3; |
|
|
|
v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; |
|
|
|
v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; |
|
|
|
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
|
|
|
v8i16 wgt, denom, offset; |
|
|
|
|
|
|
|
offset_in <<= (log2_denom); |
|
|
|
offset_val = (unsigned) offset_in << log2_denom; |
|
|
|
|
|
|
|
if (log2_denom) { |
|
|
|
offset_in += (1 << (log2_denom - 1)); |
|
|
|
} |
|
|
|
|
|
|
|
wgt = (v8u16) __msa_fill_h(src_weight); |
|
|
|
offset = (v8u16) __msa_fill_h(offset_in); |
|
|
|
denom = (v8u16) __msa_fill_h(log2_denom); |
|
|
|
wgt = __msa_fill_h(src_weight); |
|
|
|
offset = __msa_fill_h(offset_val); |
|
|
|
denom = __msa_fill_h(log2_denom); |
|
|
|
|
|
|
|
for (cnt = height / 4; cnt--;) { |
|
|
|
LD_UB4(data, stride, src0, src1, src2, src3); |
|
|
|
ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3, |
|
|
|
src0_r, src1_r, src2_r, src3_r); |
|
|
|
MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, |
|
|
|
temp0, temp1, temp2, temp3); |
|
|
|
ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset, |
|
|
|
temp0, temp1, temp2, temp3); |
|
|
|
MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0); |
|
|
|
SRL_H4_UH(temp0, temp1, temp2, temp3, denom); |
|
|
|
SAT_UH4_UH(temp0, temp1, temp2, temp3, 7); |
|
|
|
PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1); |
|
|
|
ST8x4_UB(out0, out1, data, stride); |
|
|
|
data += (4 * stride); |
|
|
|
} |
|
|
|
LD4(data, stride, tp0, tp1, tp2, tp3); |
|
|
|
INSERT_D2_UB(tp0, tp1, src0); |
|
|
|
INSERT_D2_UB(tp2, tp3, src1); |
|
|
|
LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3); |
|
|
|
INSERT_D2_UB(tp0, tp1, src2); |
|
|
|
INSERT_D2_UB(tp2, tp3, src3); |
|
|
|
UNPCK_UB_SH(src0, src0_r, src1_r); |
|
|
|
UNPCK_UB_SH(src1, src2_r, src3_r); |
|
|
|
UNPCK_UB_SH(src2, src4_r, src5_r); |
|
|
|
UNPCK_UB_SH(src3, src6_r, src7_r); |
|
|
|
MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2, |
|
|
|
tmp3); |
|
|
|
MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6, |
|
|
|
tmp7); |
|
|
|
ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0, |
|
|
|
tmp1, tmp2, tmp3); |
|
|
|
ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4, |
|
|
|
tmp5, tmp6, tmp7); |
|
|
|
MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0); |
|
|
|
SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom); |
|
|
|
SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7); |
|
|
|
PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1, |
|
|
|
src2, src3); |
|
|
|
ST8x8_UB(src0, src1, src2, src3, data, stride); |
|
|
|
} |
|
|
|
|
|
|
|
static void avc_wgt_16width_msa(uint8_t *data, int32_t stride, |
|
|
|
int32_t height, int32_t log2_denom, |
|
|
|
int32_t src_weight, int32_t offset_in) |
|
|
|
static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom, |
|
|
|
int32_t src_weight, int32_t offset_in) |
|
|
|
{ |
|
|
|
uint8_t cnt; |
|
|
|
v16i8 zero = { 0 }; |
|
|
|
v16u8 src0, src1, src2, src3; |
|
|
|
v16u8 dst0, dst1, dst2, dst3; |
|
|
|
v8u16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r; |
|
|
|
v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; |
|
|
|
v8u16 wgt, denom, offset; |
|
|
|
|
|
|
|
offset_in <<= (log2_denom); |
|
|
|
uint32_t offset_val, cnt; |
|
|
|
uint64_t tp0, tp1, tp2, tp3; |
|
|
|
v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; |
|
|
|
v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; |
|
|
|
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
|
|
|
v8i16 wgt, denom, offset; |
|
|
|
|
|
|
|
if (log2_denom) { |
|
|
|
offset_in += (1 << (log2_denom - 1)); |
|
|
|
} |
|
|
|
offset_val = (unsigned) offset_in << log2_denom; |
|
|
|
|
|
|
|
wgt = (v8u16) __msa_fill_h(src_weight); |
|
|
|
offset = (v8u16) __msa_fill_h(offset_in); |
|
|
|
denom = (v8u16) __msa_fill_h(log2_denom); |
|
|
|
wgt = __msa_fill_h(src_weight); |
|
|
|
offset = __msa_fill_h(offset_val); |
|
|
|
denom = __msa_fill_h(log2_denom); |
|
|
|
|
|
|
|
for (cnt = height / 4; cnt--;) { |
|
|
|
LD_UB4(data, stride, src0, src1, src2, src3); |
|
|
|
ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3, |
|
|
|
src0_r, src1_r, src2_r, src3_r); |
|
|
|
ILVL_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3, |
|
|
|
src0_l, src1_l, src2_l, src3_l); |
|
|
|
MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, |
|
|
|
temp0, temp1, temp2, temp3); |
|
|
|
MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, |
|
|
|
temp4, temp5, temp6, temp7); |
|
|
|
ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset, |
|
|
|
temp0, temp1, temp2, temp3); |
|
|
|
ADDS_SH4_UH(temp4, offset, temp5, offset, temp6, offset, temp7, offset, |
|
|
|
temp4, temp5, temp6, temp7); |
|
|
|
MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0); |
|
|
|
MAXI_SH4_UH(temp4, temp5, temp6, temp7, 0); |
|
|
|
SRL_H4_UH(temp0, temp1, temp2, temp3, denom); |
|
|
|
SRL_H4_UH(temp4, temp5, temp6, temp7, denom); |
|
|
|
SAT_UH4_UH(temp0, temp1, temp2, temp3, 7); |
|
|
|
SAT_UH4_UH(temp4, temp5, temp6, temp7, 7); |
|
|
|
PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, |
|
|
|
dst0, dst1, dst2, dst3); |
|
|
|
ST_UB4(dst0, dst1, dst2, dst3, data, stride); |
|
|
|
data += 4 * stride; |
|
|
|
for (cnt = 2; cnt--;) { |
|
|
|
LD4(data, stride, tp0, tp1, tp2, tp3); |
|
|
|
INSERT_D2_UB(tp0, tp1, src0); |
|
|
|
INSERT_D2_UB(tp2, tp3, src1); |
|
|
|
LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3); |
|
|
|
INSERT_D2_UB(tp0, tp1, src2); |
|
|
|
INSERT_D2_UB(tp2, tp3, src3); |
|
|
|
UNPCK_UB_SH(src0, src0_r, src1_r); |
|
|
|
UNPCK_UB_SH(src1, src2_r, src3_r); |
|
|
|
UNPCK_UB_SH(src2, src4_r, src5_r); |
|
|
|
UNPCK_UB_SH(src3, src6_r, src7_r); |
|
|
|
MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, |
|
|
|
tmp2, tmp3); |
|
|
|
MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, |
|
|
|
tmp6, tmp7); |
|
|
|
ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, |
|
|
|
tmp0, tmp1, tmp2, tmp3); |
|
|
|
ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, |
|
|
|
tmp4, tmp5, tmp6, tmp7); |
|
|
|
MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0); |
|
|
|
SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom); |
|
|
|
SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7); |
|
|
|
PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1, |
|
|
|
src2, src3); |
|
|
|
ST8x8_UB(src0, src1, src2, src3, data, stride); |
|
|
|
data += 8 * stride; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
@@ -2291,23 +2305,126 @@ void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, |
|
|
|
|
|
|
|
void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride, |
|
|
|
int height, int log2_denom, |
|
|
|
int weight_src, int offset) |
|
|
|
int weight_src, int offset_in) |
|
|
|
{ |
|
|
|
avc_wgt_16width_msa(src, stride, height, log2_denom, weight_src, offset); |
|
|
|
uint32_t offset_val; |
|
|
|
v16i8 zero = { 0 }; |
|
|
|
v16u8 src0, src1, src2, src3, src4, src5, src6, src7; |
|
|
|
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; |
|
|
|
v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r; |
|
|
|
v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r; |
|
|
|
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
|
|
|
v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; |
|
|
|
v8i16 wgt, denom, offset; |
|
|
|
|
|
|
|
offset_val = (unsigned) offset_in << log2_denom; |
|
|
|
|
|
|
|
wgt = __msa_fill_h(weight_src); |
|
|
|
offset = __msa_fill_h(offset_val); |
|
|
|
denom = __msa_fill_h(log2_denom); |
|
|
|
|
|
|
|
LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7); |
|
|
|
ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r, |
|
|
|
src2_r, src3_r); |
|
|
|
ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l, |
|
|
|
src2_l, src3_l); |
|
|
|
ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r, |
|
|
|
src6_r, src7_r); |
|
|
|
ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l, |
|
|
|
src6_l, src7_l); |
|
|
|
MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2, |
|
|
|
tmp3); |
|
|
|
MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6, |
|
|
|
tmp7); |
|
|
|
MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10, |
|
|
|
tmp11); |
|
|
|
MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13, |
|
|
|
tmp14, tmp15); |
|
|
|
ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0, |
|
|
|
tmp1, tmp2, tmp3); |
|
|
|
ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4, |
|
|
|
tmp5, tmp6, tmp7); |
|
|
|
ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset, tmp8, |
|
|
|
tmp9, tmp10, tmp11); |
|
|
|
ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset, |
|
|
|
tmp12, tmp13, tmp14, tmp15); |
|
|
|
MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0); |
|
|
|
MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0); |
|
|
|
SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom); |
|
|
|
SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom); |
|
|
|
SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7); |
|
|
|
SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7); |
|
|
|
PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1, |
|
|
|
dst2, dst3); |
|
|
|
PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4, |
|
|
|
dst5, dst6, dst7); |
|
|
|
ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride); |
|
|
|
src += 8 * stride; |
|
|
|
|
|
|
|
if (16 == height) { |
|
|
|
LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7); |
|
|
|
ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, |
|
|
|
src1_r, src2_r, src3_r); |
|
|
|
ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, |
|
|
|
src1_l, src2_l, src3_l); |
|
|
|
ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, |
|
|
|
src5_r, src6_r, src7_r); |
|
|
|
ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, |
|
|
|
src5_l, src6_l, src7_l); |
|
|
|
MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, |
|
|
|
tmp2, tmp3); |
|
|
|
MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, |
|
|
|
tmp6, tmp7); |
|
|
|
MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, |
|
|
|
tmp10, tmp11); |
|
|
|
MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13, |
|
|
|
tmp14, tmp15); |
|
|
|
ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, |
|
|
|
tmp0, tmp1, tmp2, tmp3); |
|
|
|
ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, |
|
|
|
tmp4, tmp5, tmp6, tmp7); |
|
|
|
ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset, |
|
|
|
tmp8, tmp9, tmp10, tmp11); |
|
|
|
ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset, |
|
|
|
tmp12, tmp13, tmp14, tmp15); |
|
|
|
MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0); |
|
|
|
MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0); |
|
|
|
SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom); |
|
|
|
SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom); |
|
|
|
SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7); |
|
|
|
SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7); |
|
|
|
PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1, |
|
|
|
dst2, dst3); |
|
|
|
PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4, |
|
|
|
dst5, dst6, dst7); |
|
|
|
ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride, |
|
|
|
int height, int log2_denom, |
|
|
|
int weight_src, int offset) |
|
|
|
{ |
|
|
|
avc_wgt_8width_msa(src, stride, height, log2_denom, weight_src, offset); |
|
|
|
if (4 == height) { |
|
|
|
avc_wgt_8x4_msa(src, stride, log2_denom, weight_src, offset); |
|
|
|
} else if (8 == height) { |
|
|
|
avc_wgt_8x8_msa(src, stride, log2_denom, weight_src, offset); |
|
|
|
} else { |
|
|
|
avc_wgt_8x16_msa(src, stride, log2_denom, weight_src, offset); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride, |
|
|
|
int height, int log2_denom, |
|
|
|
int weight_src, int offset) |
|
|
|
{ |
|
|
|
avc_wgt_4width_msa(src, stride, height, log2_denom, weight_src, offset); |
|
|
|
if (2 == height) { |
|
|
|
avc_wgt_4x2_msa(src, stride, log2_denom, weight_src, offset); |
|
|
|
} else if (4 == height) { |
|
|
|
avc_wgt_4x4_msa(src, stride, log2_denom, weight_src, offset); |
|
|
|
} else { |
|
|
|
avc_wgt_4x8_msa(src, stride, log2_denom, weight_src, offset); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src, |
|
|
|
|