Changes are as follows: 1. Remove the local variable 'out_m' in 'CLIP_SH' and store the result in the source vector. 2. Refine the implementation of the macros 'CLIP_SH_0_255' and 'CLIP_SW_0_255'. VP8 decoding performance has sped up by about 1.1% (from 7.03x to 7.11x). H264 decoding performance has sped up by about 0.5% (from 4.35x to 4.37x). Theora decoding performance has sped up by about 0.7% (from 5.79x to 5.83x). 3. Remove the redundant macros 'CLIP_SH/Wn_0_255_MAX_SATU' and use 'CLIP_SH/Wn_0_255' instead, because there is no difference in the effect of these two macros. Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>tags/n4.3
@@ -413,8 +413,7 @@ static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride, | |||||
tmp7 = __msa_dpadd_s_h(offset, wgt, vec7); | tmp7 = __msa_dpadd_s_h(offset, wgt, vec7); | ||||
SRA_4V(tmp0, tmp1, tmp2, tmp3, denom); | SRA_4V(tmp0, tmp1, tmp2, tmp3, denom); | ||||
SRA_4V(tmp4, tmp5, tmp6, tmp7, denom); | SRA_4V(tmp4, tmp5, tmp6, tmp7, denom); | ||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7); | |||||
CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1); | ||||
PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3); | PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3); | ||||
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); | ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); | ||||
@@ -475,8 +474,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, | |||||
SRA_4V(temp0, temp1, temp2, temp3, denom); | SRA_4V(temp0, temp1, temp2, temp3, denom); | ||||
SRA_4V(temp4, temp5, temp6, temp7, denom); | SRA_4V(temp4, temp5, temp6, temp7, denom); | ||||
CLIP_SH4_0_255(temp0, temp1, temp2, temp3); | |||||
CLIP_SH4_0_255(temp4, temp5, temp6, temp7); | |||||
CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7); | |||||
PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, | PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, | ||||
dst0, dst1, dst2, dst3); | dst0, dst1, dst2, dst3); | ||||
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); | ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); | ||||
@@ -531,7 +529,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, | |||||
temp = p1_or_q1_org_in << 1; \ | temp = p1_or_q1_org_in << 1; \ | ||||
clip3 = clip3 - temp; \ | clip3 = clip3 - temp; \ | ||||
clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3); \ | clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3); \ | ||||
clip3 = CLIP_SH(clip3, negate_tc_in, tc_in); \ | |||||
CLIP_SH(clip3, negate_tc_in, tc_in); \ | |||||
p1_or_q1_out = p1_or_q1_org_in + clip3; \ | p1_or_q1_out = p1_or_q1_org_in + clip3; \ | ||||
} | } | ||||
@@ -549,7 +547,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, | |||||
delta = q0_sub_p0 + p1_sub_q1; \ | delta = q0_sub_p0 + p1_sub_q1; \ | ||||
delta >>= 3; \ | delta >>= 3; \ | ||||
\ | \ | ||||
delta = CLIP_SH(delta, negate_threshold_in, threshold_in); \ | |||||
CLIP_SH(delta, negate_threshold_in, threshold_in); \ | |||||
\ | \ | ||||
p0_or_q0_out = p0_or_q0_org_in + delta; \ | p0_or_q0_out = p0_or_q0_org_in + delta; \ | ||||
q0_or_p0_out = q0_or_p0_org_in - delta; \ | q0_or_p0_out = q0_or_p0_org_in - delta; \ | ||||
@@ -598,7 +596,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, | |||||
delta = q0_sub_p0 + p1_sub_q1; \ | delta = q0_sub_p0 + p1_sub_q1; \ | ||||
delta = __msa_srari_h(delta, 3); \ | delta = __msa_srari_h(delta, 3); \ | ||||
\ | \ | ||||
delta = CLIP_SH(delta, -tc, tc); \ | |||||
CLIP_SH(delta, -tc, tc); \ | |||||
\ | \ | ||||
ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \ | ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \ | ||||
\ | \ | ||||
@@ -662,7 +660,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, | |||||
q0_sub_p0 <<= 2; \ | q0_sub_p0 <<= 2; \ | ||||
delta = q0_sub_p0 + p1_sub_q1; \ | delta = q0_sub_p0 + p1_sub_q1; \ | ||||
delta = __msa_srari_h(delta, 3); \ | delta = __msa_srari_h(delta, 3); \ | ||||
delta = CLIP_SH(delta, -tc, tc); \ | |||||
CLIP_SH(delta, -tc, tc); \ | |||||
\ | \ | ||||
ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \ | ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \ | ||||
\ | \ | ||||
@@ -1741,7 +1739,7 @@ static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride, | |||||
v8i16 tc, tc_orig_r, tc_plus1; | v8i16 tc, tc_orig_r, tc_plus1; | ||||
v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 }; | v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 }; | ||||
v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1; | v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1; | ||||
v8u16 src2_r, src3_r; | |||||
v8i16 src2_r, src3_r; | |||||
v8i16 p2_r, p1_r, q2_r, q1_r; | v8i16 p2_r, p1_r, q2_r, q1_r; | ||||
v16u8 p2, q2, p0, q0; | v16u8 p2, q2, p0, q0; | ||||
v4i32 dst0, dst1; | v4i32 dst0, dst1; | ||||
@@ -1839,8 +1837,8 @@ static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride, | |||||
tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig); | tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig); | ||||
tc = tc_orig_r; | tc = tc_orig_r; | ||||
p2_r = CLIP_SH(p2_r, -tc_orig_r, tc_orig_r); | |||||
q2_r = CLIP_SH(q2_r, -tc_orig_r, tc_orig_r); | |||||
CLIP_SH(p2_r, -tc_orig_r, tc_orig_r); | |||||
CLIP_SH(q2_r, -tc_orig_r, tc_orig_r); | |||||
p2_r += p1_r; | p2_r += p1_r; | ||||
q2_r += q1_r; | q2_r += q1_r; | ||||
@@ -1872,14 +1870,13 @@ static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride, | |||||
(v16i8) is_less_than_beta2); | (v16i8) is_less_than_beta2); | ||||
tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2); | tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2); | ||||
q0_sub_p0 = CLIP_SH(q0_sub_p0, -tc, tc); | |||||
CLIP_SH(q0_sub_p0, -tc, tc); | |||||
ILVR_B2_UH(zeros, src2, zeros, src3, src2_r, src3_r); | |||||
ILVR_B2_SH(zeros, src2, zeros, src3, src2_r, src3_r); | |||||
src2_r += q0_sub_p0; | src2_r += q0_sub_p0; | ||||
src3_r -= q0_sub_p0; | src3_r -= q0_sub_p0; | ||||
src2_r = (v8u16) CLIP_SH_0_255(src2_r); | |||||
src3_r = (v8u16) CLIP_SH_0_255(src3_r); | |||||
CLIP_SH2_0_255(src2_r, src3_r); | |||||
PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0); | PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0); | ||||
@@ -2509,10 +2506,8 @@ void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src, | |||||
SRA_4V(tmp4, tmp5, tmp6, tmp7, denom); | SRA_4V(tmp4, tmp5, tmp6, tmp7, denom); | ||||
SRA_4V(tmp8, tmp9, tmp10, tmp11, denom); | SRA_4V(tmp8, tmp9, tmp10, tmp11, denom); | ||||
SRA_4V(tmp12, tmp13, tmp14, tmp15, denom); | SRA_4V(tmp12, tmp13, tmp14, tmp15, denom); | ||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7); | |||||
CLIP_SH4_0_255(tmp8, tmp9, tmp10, tmp11); | |||||
CLIP_SH4_0_255(tmp12, tmp13, tmp14, tmp15); | |||||
CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); | |||||
CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15); | |||||
PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1, | PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1, | ||||
dst2, dst3); | dst2, dst3); | ||||
PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4, | PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4, | ||||
@@ -2553,10 +2548,8 @@ void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src, | |||||
SRA_4V(tmp4, tmp5, tmp6, tmp7, denom); | SRA_4V(tmp4, tmp5, tmp6, tmp7, denom); | ||||
SRA_4V(tmp8, tmp9, tmp10, tmp11, denom); | SRA_4V(tmp8, tmp9, tmp10, tmp11, denom); | ||||
SRA_4V(tmp12, tmp13, tmp14, tmp15, denom); | SRA_4V(tmp12, tmp13, tmp14, tmp15, denom); | ||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7); | |||||
CLIP_SH4_0_255(tmp8, tmp9, tmp10, tmp11); | |||||
CLIP_SH4_0_255(tmp12, tmp13, tmp14, tmp15); | |||||
CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); | |||||
CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15); | |||||
PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1, | PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1, | ||||
dst2, dst3); | dst2, dst3); | ||||
PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4, | PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4, | ||||
@@ -233,8 +233,7 @@ static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride) | |||||
res0, res1, res2, res3); | res0, res1, res2, res3); | ||||
ADD4(res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7, | ADD4(res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7, | ||||
res4, res5, res6, res7); | res4, res5, res6, res7); | ||||
CLIP_SH4_0_255(res0, res1, res2, res3); | |||||
CLIP_SH4_0_255(res4, res5, res6, res7); | |||||
CLIP_SH8_0_255(res0, res1, res2, res3, res4, res5, res6, res7); | |||||
PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, | PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, | ||||
dst0, dst1, dst2, dst3); | dst0, dst1, dst2, dst3); | ||||
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride) | ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride) | ||||
@@ -263,8 +262,8 @@ static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src, | |||||
dst0_r, dst1_r, dst2_r, dst3_r); | dst0_r, dst1_r, dst2_r, dst3_r); | ||||
ADD4(dst4_r, dc, dst5_r, dc, dst6_r, dc, dst7_r, dc, | ADD4(dst4_r, dc, dst5_r, dc, dst6_r, dc, dst7_r, dc, | ||||
dst4_r, dst5_r, dst6_r, dst7_r); | dst4_r, dst5_r, dst6_r, dst7_r); | ||||
CLIP_SH4_0_255(dst0_r, dst1_r, dst2_r, dst3_r); | |||||
CLIP_SH4_0_255(dst4_r, dst5_r, dst6_r, dst7_r); | |||||
CLIP_SH8_0_255(dst0_r, dst1_r, dst2_r, dst3_r, | |||||
dst4_r, dst5_r, dst6_r, dst7_r); | |||||
PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r, | PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r, | ||||
dst0, dst1, dst2, dst3); | dst0, dst1, dst2, dst3); | ||||
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride) | ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride) | ||||
@@ -803,8 +803,9 @@ static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride) | |||||
LD_SH4((coeffs + 8), 16, in1, in3, in5, in7); | LD_SH4((coeffs + 8), 16, in1, in3, in5, in7); | ||||
coeffs += 64; | coeffs += 64; | ||||
CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1); | |||||
CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3); | |||||
CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1, | |||||
dst_r2, dst_l2, dst_r3, dst_l3); | |||||
PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, | PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, | ||||
dst_r3, dst0, dst1, dst2, dst3); | dst_r3, dst0, dst1, dst2, dst3); | ||||
ST_UB4(dst0, dst1, dst2, dst3, dst, stride); | ST_UB4(dst0, dst1, dst2, dst3, dst, stride); | ||||
@@ -825,8 +826,8 @@ static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride) | |||||
dst_r3 += in6; | dst_r3 += in6; | ||||
dst_l3 += in7; | dst_l3 += in7; | ||||
CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1); | |||||
CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3); | |||||
CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1, | |||||
dst_r2, dst_l2, dst_r3, dst_l3); | |||||
PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, | PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, | ||||
dst_r3, dst0, dst1, dst2, dst3); | dst_r3, dst0, dst1, dst2, dst3); | ||||
ST_UB4(dst0, dst1, dst2, dst3, dst, stride); | ST_UB4(dst0, dst1, dst2, dst3, dst, stride); | ||||
@@ -873,8 +874,8 @@ static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride) | |||||
LD_SH4((coeffs + 8), 16, in1, in3, in5, in7); | LD_SH4((coeffs + 8), 16, in1, in3, in5, in7); | ||||
coeffs += 64; | coeffs += 64; | ||||
CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1); | |||||
CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3); | |||||
CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1, | |||||
dst_r2, dst_l2, dst_r3, dst_l3); | |||||
PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, | PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, | ||||
dst_r3, dst0, dst1, dst2, dst3); | dst_r3, dst0, dst1, dst2, dst3); | ||||
ST_UB2(dst0, dst1, dst, 16); | ST_UB2(dst0, dst1, dst, 16); | ||||
@@ -905,8 +906,8 @@ static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride) | |||||
LD_SH4(coeffs, 16, in0, in2, in4, in6); | LD_SH4(coeffs, 16, in0, in2, in4, in6); | ||||
LD_SH4((coeffs + 8), 16, in1, in3, in5, in7); | LD_SH4((coeffs + 8), 16, in1, in3, in5, in7); | ||||
CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1); | |||||
CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3); | |||||
CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1, | |||||
dst_r2, dst_l2, dst_r3, dst_l3); | |||||
PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, | PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, | ||||
dst_r3, dst0, dst1, dst2, dst3); | dst_r3, dst0, dst1, dst2, dst3); | ||||
ST_UB2(dst0, dst1, dst, 16); | ST_UB2(dst0, dst1, dst, 16); | ||||
@@ -928,8 +929,8 @@ static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride) | |||||
dst_r3 += in6; | dst_r3 += in6; | ||||
dst_l3 += in7; | dst_l3 += in7; | ||||
CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1); | |||||
CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3); | |||||
CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1, | |||||
dst_r2, dst_l2, dst_r3, dst_l3); | |||||
PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, | PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, | ||||
dst_r3, dst0, dst1, dst2, dst3); | dst_r3, dst0, dst1, dst2, dst3); | ||||
ST_UB2(dst0, dst1, dst, 16); | ST_UB2(dst0, dst1, dst, 16); | ||||
@@ -140,19 +140,19 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, | |||||
temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; | temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - p2_src); | temp2 = (v8i16) (temp1 - p2_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst0 = (v16u8) (temp2 + (v8i16) p2_src); | dst0 = (v16u8) (temp2 + (v8i16) p2_src); | ||||
temp1 = temp0 + p2_src; | temp1 = temp0 + p2_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | ||||
temp2 = (v8i16) (temp1 - p1_src); | temp2 = (v8i16) (temp1 - p1_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst1 = (v16u8) (temp2 + (v8i16) p1_src); | dst1 = (v16u8) (temp2 + (v8i16) p1_src); | ||||
temp1 = (temp0 << 1) + p2_src + q1_src; | temp1 = (temp0 << 1) + p2_src + q1_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - p0_src); | temp2 = (v8i16) (temp1 - p0_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst2 = (v16u8) (temp2 + (v8i16) p0_src); | dst2 = (v16u8) (temp2 + (v8i16) p0_src); | ||||
dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); | dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); | ||||
@@ -165,19 +165,19 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, | |||||
temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; | temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - q2_src); | temp2 = (v8i16) (temp1 - q2_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst5 = (v16u8) (temp2 + (v8i16) q2_src); | dst5 = (v16u8) (temp2 + (v8i16) q2_src); | ||||
temp1 = temp0 + q2_src; | temp1 = temp0 + q2_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | ||||
temp2 = (v8i16) (temp1 - q1_src); | temp2 = (v8i16) (temp1 - q1_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst4 = (v16u8) (temp2 + (v8i16) q1_src); | dst4 = (v16u8) (temp2 + (v8i16) q1_src); | ||||
temp1 = (temp0 << 1) + p1_src + q2_src; | temp1 = (temp0 << 1) + p1_src + q2_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - q0_src); | temp2 = (v8i16) (temp1 - q0_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst3 = (v16u8) (temp2 + (v8i16) q0_src); | dst3 = (v16u8) (temp2 + (v8i16) q0_src); | ||||
dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); | dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); | ||||
@@ -218,15 +218,15 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, | |||||
abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); | abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); | ||||
abs_delta0 = (v8u16) abs_delta0 < temp1; | abs_delta0 = (v8u16) abs_delta0 < temp1; | ||||
delta0 = CLIP_SH(delta0, tc_neg, tc_pos); | |||||
CLIP_SH(delta0, tc_neg, tc_pos); | |||||
temp0 = (v8u16) (delta0 + p0_src); | |||||
temp0 = (v8u16) CLIP_SH_0_255(temp0); | |||||
temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, | |||||
temp2 = (v8i16) (delta0 + p0_src); | |||||
CLIP_SH_0_255(temp2); | |||||
temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src, | |||||
(v16u8) p_is_pcm_vec); | (v16u8) p_is_pcm_vec); | ||||
temp2 = (v8i16) (q0_src - delta0); | temp2 = (v8i16) (q0_src - delta0); | ||||
temp2 = CLIP_SH_0_255(temp2); | |||||
CLIP_SH_0_255(temp2); | |||||
temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, | temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, | ||||
(v16u8) q_is_pcm_vec); | (v16u8) q_is_pcm_vec); | ||||
@@ -252,9 +252,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, | |||||
delta1 -= (v8i16) p1_src; | delta1 -= (v8i16) p1_src; | ||||
delta1 += delta0; | delta1 += delta0; | ||||
delta1 >>= 1; | delta1 >>= 1; | ||||
delta1 = CLIP_SH(delta1, tc_neg, tc_pos); | |||||
CLIP_SH(delta1, tc_neg, tc_pos); | |||||
delta1 = (v8i16) p1_src + (v8i16) delta1; | delta1 = (v8i16) p1_src + (v8i16) delta1; | ||||
delta1 = CLIP_SH_0_255(delta1); | |||||
CLIP_SH_0_255(delta1); | |||||
delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, | delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, | ||||
(v16u8) p_is_pcm_vec); | (v16u8) p_is_pcm_vec); | ||||
@@ -262,9 +262,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, | |||||
delta2 = delta2 - (v8i16) q1_src; | delta2 = delta2 - (v8i16) q1_src; | ||||
delta2 = delta2 - delta0; | delta2 = delta2 - delta0; | ||||
delta2 = delta2 >> 1; | delta2 = delta2 >> 1; | ||||
delta2 = CLIP_SH(delta2, tc_neg, tc_pos); | |||||
CLIP_SH(delta2, tc_neg, tc_pos); | |||||
delta2 = (v8i16) q1_src + (v8i16) delta2; | delta2 = (v8i16) q1_src + (v8i16) delta2; | ||||
delta2 = CLIP_SH_0_255(delta2); | |||||
CLIP_SH_0_255(delta2); | |||||
delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, | delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, | ||||
(v16u8) q_is_pcm_vec); | (v16u8) q_is_pcm_vec); | ||||
@@ -298,19 +298,19 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, | |||||
temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; | temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - p2_src); | temp2 = (v8i16) (temp1 - p2_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst0 = (v16u8) (temp2 + (v8i16) p2_src); | dst0 = (v16u8) (temp2 + (v8i16) p2_src); | ||||
temp1 = temp0 + p2_src; | temp1 = temp0 + p2_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | ||||
temp2 = (v8i16) (temp1 - p1_src); | temp2 = (v8i16) (temp1 - p1_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst1 = (v16u8) (temp2 + (v8i16) p1_src); | dst1 = (v16u8) (temp2 + (v8i16) p1_src); | ||||
temp1 = (temp0 << 1) + p2_src + q1_src; | temp1 = (temp0 << 1) + p2_src + q1_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - p0_src); | temp2 = (v8i16) (temp1 - p0_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst2 = (v16u8) (temp2 + (v8i16) p0_src); | dst2 = (v16u8) (temp2 + (v8i16) p0_src); | ||||
dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); | dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); | ||||
@@ -323,19 +323,19 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, | |||||
temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; | temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - q2_src); | temp2 = (v8i16) (temp1 - q2_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst5 = (v16u8) (temp2 + (v8i16) q2_src); | dst5 = (v16u8) (temp2 + (v8i16) q2_src); | ||||
temp1 = temp0 + q2_src; | temp1 = temp0 + q2_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | ||||
temp2 = (v8i16) (temp1 - q1_src); | temp2 = (v8i16) (temp1 - q1_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst4 = (v16u8) (temp2 + (v8i16) q1_src); | dst4 = (v16u8) (temp2 + (v8i16) q1_src); | ||||
temp1 = (temp0 << 1) + p1_src + q2_src; | temp1 = (temp0 << 1) + p1_src + q2_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - q0_src); | temp2 = (v8i16) (temp1 - q0_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst3 = (v16u8) (temp2 + (v8i16) q0_src); | dst3 = (v16u8) (temp2 + (v8i16) q0_src); | ||||
dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); | dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); | ||||
@@ -362,15 +362,15 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, | |||||
abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); | abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); | ||||
abs_delta0 = (v8u16) abs_delta0 < temp1; | abs_delta0 = (v8u16) abs_delta0 < temp1; | ||||
delta0 = CLIP_SH(delta0, tc_neg, tc_pos); | |||||
CLIP_SH(delta0, tc_neg, tc_pos); | |||||
temp0 = (v8u16) (delta0 + p0_src); | |||||
temp0 = (v8u16) CLIP_SH_0_255(temp0); | |||||
temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, | |||||
temp2 = (v8i16) (delta0 + p0_src); | |||||
CLIP_SH_0_255(temp2); | |||||
temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src, | |||||
(v16u8) p_is_pcm_vec); | (v16u8) p_is_pcm_vec); | ||||
temp2 = (v8i16) (q0_src - delta0); | temp2 = (v8i16) (q0_src - delta0); | ||||
temp2 = CLIP_SH_0_255(temp2); | |||||
CLIP_SH_0_255(temp2); | |||||
temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, | temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, | ||||
(v16u8) q_is_pcm_vec); | (v16u8) q_is_pcm_vec); | ||||
@@ -394,9 +394,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, | |||||
delta1 -= (v8i16) p1_src; | delta1 -= (v8i16) p1_src; | ||||
delta1 += delta0; | delta1 += delta0; | ||||
delta1 >>= 1; | delta1 >>= 1; | ||||
delta1 = CLIP_SH(delta1, tc_neg, tc_pos); | |||||
CLIP_SH(delta1, tc_neg, tc_pos); | |||||
delta1 = (v8i16) p1_src + (v8i16) delta1; | delta1 = (v8i16) p1_src + (v8i16) delta1; | ||||
delta1 = CLIP_SH_0_255(delta1); | |||||
CLIP_SH_0_255(delta1); | |||||
delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, | delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, | ||||
(v16u8) p_is_pcm_vec); | (v16u8) p_is_pcm_vec); | ||||
@@ -404,9 +404,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, | |||||
delta2 = delta2 - (v8i16) q1_src; | delta2 = delta2 - (v8i16) q1_src; | ||||
delta2 = delta2 - delta0; | delta2 = delta2 - delta0; | ||||
delta2 = delta2 >> 1; | delta2 = delta2 >> 1; | ||||
delta2 = CLIP_SH(delta2, tc_neg, tc_pos); | |||||
CLIP_SH(delta2, tc_neg, tc_pos); | |||||
delta2 = (v8i16) q1_src + (v8i16) delta2; | delta2 = (v8i16) q1_src + (v8i16) delta2; | ||||
delta2 = CLIP_SH_0_255(delta2); | |||||
CLIP_SH_0_255(delta2); | |||||
delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, | delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, | ||||
(v16u8) q_is_pcm_vec); | (v16u8) q_is_pcm_vec); | ||||
@@ -561,19 +561,19 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, | |||||
temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; | temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - p2_src); | temp2 = (v8i16) (temp1 - p2_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst0 = (v16u8) (temp2 + (v8i16) p2_src); | dst0 = (v16u8) (temp2 + (v8i16) p2_src); | ||||
temp1 = temp0 + p2_src; | temp1 = temp0 + p2_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | ||||
temp2 = (v8i16) (temp1 - p1_src); | temp2 = (v8i16) (temp1 - p1_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst1 = (v16u8) (temp2 + (v8i16) p1_src); | dst1 = (v16u8) (temp2 + (v8i16) p1_src); | ||||
temp1 = (temp0 << 1) + p2_src + q1_src; | temp1 = (temp0 << 1) + p2_src + q1_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - p0_src); | temp2 = (v8i16) (temp1 - p0_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst2 = (v16u8) (temp2 + (v8i16) p0_src); | dst2 = (v16u8) (temp2 + (v8i16) p0_src); | ||||
dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); | dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); | ||||
@@ -585,19 +585,19 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, | |||||
temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; | temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - q2_src); | temp2 = (v8i16) (temp1 - q2_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst5 = (v16u8) (temp2 + (v8i16) q2_src); | dst5 = (v16u8) (temp2 + (v8i16) q2_src); | ||||
temp1 = temp0 + q2_src; | temp1 = temp0 + q2_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | ||||
temp2 = (v8i16) (temp1 - q1_src); | temp2 = (v8i16) (temp1 - q1_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst4 = (v16u8) (temp2 + (v8i16) q1_src); | dst4 = (v16u8) (temp2 + (v8i16) q1_src); | ||||
temp1 = (temp0 << 1) + p1_src + q2_src; | temp1 = (temp0 << 1) + p1_src + q2_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - q0_src); | temp2 = (v8i16) (temp1 - q0_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst3 = (v16u8) (temp2 + (v8i16) q0_src); | dst3 = (v16u8) (temp2 + (v8i16) q0_src); | ||||
dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); | dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); | ||||
@@ -620,14 +620,14 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, | |||||
abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); | abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); | ||||
abs_delta0 = (v8u16) abs_delta0 < temp1; | abs_delta0 = (v8u16) abs_delta0 < temp1; | ||||
delta0 = CLIP_SH(delta0, tc_neg, tc_pos); | |||||
temp0 = (v8u16) (delta0 + p0_src); | |||||
temp0 = (v8u16) CLIP_SH_0_255(temp0); | |||||
temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, | |||||
CLIP_SH(delta0, tc_neg, tc_pos); | |||||
temp2 = (v8i16) (delta0 + p0_src); | |||||
CLIP_SH_0_255(temp2); | |||||
temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src, | |||||
(v16u8) p_is_pcm_vec); | (v16u8) p_is_pcm_vec); | ||||
temp2 = (v8i16) (q0_src - delta0); | temp2 = (v8i16) (q0_src - delta0); | ||||
temp2 = CLIP_SH_0_255(temp2); | |||||
CLIP_SH_0_255(temp2); | |||||
temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, | temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, | ||||
(v16u8) q_is_pcm_vec); | (v16u8) q_is_pcm_vec); | ||||
@@ -649,9 +649,9 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, | |||||
delta1 -= (v8i16) p1_src; | delta1 -= (v8i16) p1_src; | ||||
delta1 += delta0; | delta1 += delta0; | ||||
delta1 >>= 1; | delta1 >>= 1; | ||||
delta1 = CLIP_SH(delta1, tc_neg, tc_pos); | |||||
CLIP_SH(delta1, tc_neg, tc_pos); | |||||
delta1 = (v8i16) p1_src + (v8i16) delta1; | delta1 = (v8i16) p1_src + (v8i16) delta1; | ||||
delta1 = CLIP_SH_0_255(delta1); | |||||
CLIP_SH_0_255(delta1); | |||||
delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, | delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, | ||||
(v16u8) p_is_pcm_vec); | (v16u8) p_is_pcm_vec); | ||||
@@ -659,9 +659,9 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, | |||||
delta2 = delta2 - (v8i16) q1_src; | delta2 = delta2 - (v8i16) q1_src; | ||||
delta2 = delta2 - delta0; | delta2 = delta2 - delta0; | ||||
delta2 = delta2 >> 1; | delta2 = delta2 >> 1; | ||||
delta2 = CLIP_SH(delta2, tc_neg, tc_pos); | |||||
CLIP_SH(delta2, tc_neg, tc_pos); | |||||
delta2 = (v8i16) q1_src + (v8i16) delta2; | delta2 = (v8i16) q1_src + (v8i16) delta2; | ||||
delta2 = CLIP_SH_0_255(delta2); | |||||
CLIP_SH_0_255(delta2); | |||||
delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, | delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, | ||||
(v16u8) q_is_pcm_vec); | (v16u8) q_is_pcm_vec); | ||||
@@ -726,19 +726,19 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, | |||||
temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; | temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - p2_src); | temp2 = (v8i16) (temp1 - p2_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst0 = (v16u8) (temp2 + (v8i16) p2_src); | dst0 = (v16u8) (temp2 + (v8i16) p2_src); | ||||
temp1 = temp0 + p2_src; | temp1 = temp0 + p2_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | ||||
temp2 = (v8i16) (temp1 - p1_src); | temp2 = (v8i16) (temp1 - p1_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst1 = (v16u8) (temp2 + (v8i16) p1_src); | dst1 = (v16u8) (temp2 + (v8i16) p1_src); | ||||
temp1 = (temp0 << 1) + p2_src + q1_src; | temp1 = (temp0 << 1) + p2_src + q1_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - p0_src); | temp2 = (v8i16) (temp1 - p0_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst2 = (v16u8) (temp2 + (v8i16) p0_src); | dst2 = (v16u8) (temp2 + (v8i16) p0_src); | ||||
dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); | dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); | ||||
@@ -750,19 +750,19 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, | |||||
temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; | temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - q2_src); | temp2 = (v8i16) (temp1 - q2_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst5 = (v16u8) (temp2 + (v8i16) q2_src); | dst5 = (v16u8) (temp2 + (v8i16) q2_src); | ||||
temp1 = temp0 + q2_src; | temp1 = temp0 + q2_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); | ||||
temp2 = (v8i16) (temp1 - q1_src); | temp2 = (v8i16) (temp1 - q1_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst4 = (v16u8) (temp2 + (v8i16) q1_src); | dst4 = (v16u8) (temp2 + (v8i16) q1_src); | ||||
temp1 = (temp0 << 1) + p1_src + q2_src; | temp1 = (temp0 << 1) + p1_src + q2_src; | ||||
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); | ||||
temp2 = (v8i16) (temp1 - q0_src); | temp2 = (v8i16) (temp1 - q0_src); | ||||
temp2 = CLIP_SH(temp2, tc_neg, tc_pos); | |||||
CLIP_SH(temp2, tc_neg, tc_pos); | |||||
dst3 = (v16u8) (temp2 + (v8i16) q0_src); | dst3 = (v16u8) (temp2 + (v8i16) q0_src); | ||||
dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); | dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); | ||||
@@ -785,15 +785,15 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, | |||||
abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); | abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); | ||||
abs_delta0 = (v8u16) abs_delta0 < temp1; | abs_delta0 = (v8u16) abs_delta0 < temp1; | ||||
delta0 = CLIP_SH(delta0, tc_neg, tc_pos); | |||||
CLIP_SH(delta0, tc_neg, tc_pos); | |||||
temp0 = (v8u16) (delta0 + p0_src); | |||||
temp0 = (v8u16) CLIP_SH_0_255(temp0); | |||||
temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, | |||||
temp2 = (v8i16) (delta0 + p0_src); | |||||
CLIP_SH_0_255(temp2); | |||||
temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src, | |||||
(v16u8) p_is_pcm_vec); | (v16u8) p_is_pcm_vec); | ||||
temp2 = (v8i16) (q0_src - delta0); | temp2 = (v8i16) (q0_src - delta0); | ||||
temp2 = CLIP_SH_0_255(temp2); | |||||
CLIP_SH_0_255(temp2); | |||||
temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, | temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, | ||||
(v16u8) q_is_pcm_vec); | (v16u8) q_is_pcm_vec); | ||||
@@ -815,9 +815,9 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, | |||||
delta1 -= (v8i16) p1_src; | delta1 -= (v8i16) p1_src; | ||||
delta1 += delta0; | delta1 += delta0; | ||||
delta1 >>= 1; | delta1 >>= 1; | ||||
delta1 = CLIP_SH(delta1, tc_neg, tc_pos); | |||||
CLIP_SH(delta1, tc_neg, tc_pos); | |||||
delta1 = (v8i16) p1_src + (v8i16) delta1; | delta1 = (v8i16) p1_src + (v8i16) delta1; | ||||
delta1 = CLIP_SH_0_255(delta1); | |||||
CLIP_SH_0_255(delta1); | |||||
delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, | delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, | ||||
(v16u8) p_is_pcm_vec); | (v16u8) p_is_pcm_vec); | ||||
@@ -825,9 +825,9 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, | |||||
delta2 = delta2 - (v8i16) q1_src; | delta2 = delta2 - (v8i16) q1_src; | ||||
delta2 = delta2 - delta0; | delta2 = delta2 - delta0; | ||||
delta2 = delta2 >> 1; | delta2 = delta2 >> 1; | ||||
delta2 = CLIP_SH(delta2, tc_neg, tc_pos); | |||||
CLIP_SH(delta2, tc_neg, tc_pos); | |||||
delta2 = (v8i16) q1_src + (v8i16) delta2; | delta2 = (v8i16) q1_src + (v8i16) delta2; | ||||
delta2 = CLIP_SH_0_255(delta2); | |||||
CLIP_SH_0_255(delta2); | |||||
delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, | delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, | ||||
(v16u8) q_is_pcm_vec); | (v16u8) q_is_pcm_vec); | ||||
delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src, | delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src, | ||||
@@ -955,15 +955,15 @@ static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride, | |||||
temp0 <<= 2; | temp0 <<= 2; | ||||
temp0 += temp1; | temp0 += temp1; | ||||
delta = __msa_srari_h((v8i16) temp0, 3); | delta = __msa_srari_h((v8i16) temp0, 3); | ||||
delta = CLIP_SH(delta, tc_neg, tc_pos); | |||||
CLIP_SH(delta, tc_neg, tc_pos); | |||||
temp0 = (v8i16) ((v8i16) p0 + delta); | temp0 = (v8i16) ((v8i16) p0 + delta); | ||||
temp0 = CLIP_SH_0_255(temp0); | |||||
CLIP_SH_0_255(temp0); | |||||
temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0, | temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0, | ||||
(v16u8) p_is_pcm_vec); | (v16u8) p_is_pcm_vec); | ||||
temp1 = (v8i16) ((v8i16) q0 - delta); | temp1 = (v8i16) ((v8i16) q0 - delta); | ||||
temp1 = CLIP_SH_0_255(temp1); | |||||
CLIP_SH_0_255(temp1); | |||||
temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0, | temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0, | ||||
(v16u8) q_is_pcm_vec); | (v16u8) q_is_pcm_vec); | ||||
@@ -1014,15 +1014,15 @@ static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride, | |||||
temp0 <<= 2; | temp0 <<= 2; | ||||
temp0 += temp1; | temp0 += temp1; | ||||
delta = __msa_srari_h((v8i16) temp0, 3); | delta = __msa_srari_h((v8i16) temp0, 3); | ||||
delta = CLIP_SH(delta, tc_neg, tc_pos); | |||||
CLIP_SH(delta, tc_neg, tc_pos); | |||||
temp0 = (v8i16) ((v8i16) p0 + delta); | temp0 = (v8i16) ((v8i16) p0 + delta); | ||||
temp0 = CLIP_SH_0_255(temp0); | |||||
CLIP_SH_0_255(temp0); | |||||
temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0, | temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0, | ||||
(v16u8) p_is_pcm_vec); | (v16u8) p_is_pcm_vec); | ||||
temp1 = (v8i16) ((v8i16) q0 - delta); | temp1 = (v8i16) ((v8i16) q0 - delta); | ||||
temp1 = CLIP_SH_0_255(temp1); | |||||
CLIP_SH_0_255(temp1); | |||||
temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0, | temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0, | ||||
(v16u8) q_is_pcm_vec); | (v16u8) q_is_pcm_vec); | ||||
@@ -48,7 +48,7 @@ static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = { | |||||
{ \ | { \ | ||||
ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \ | ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \ | ||||
SRARI_H2_SH(out0, out1, rnd_val); \ | SRARI_H2_SH(out0, out1, rnd_val); \ | ||||
CLIP_SH2_0_255_MAX_SATU(out0, out1); \ | |||||
CLIP_SH2_0_255(out0, out1); \ | |||||
} | } | ||||
#define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \ | #define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \ | ||||
@@ -83,7 +83,7 @@ static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr, | |||||
dst0 <<= 6; | dst0 <<= 6; | ||||
dst0 += in0; | dst0 += in0; | ||||
dst0 = __msa_srari_h(dst0, 7); | dst0 = __msa_srari_h(dst0, 7); | ||||
dst0 = CLIP_SH_0_255_MAX_SATU(dst0); | |||||
CLIP_SH_0_255(dst0); | |||||
dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); | dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); | ||||
ST_W2(dst0, 0, 1, dst, dst_stride); | ST_W2(dst0, 0, 1, dst, dst_stride); | ||||
@@ -739,7 +739,7 @@ static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr, | |||||
HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); | HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); | ||||
dst2 = __msa_adds_s_h(in2, dst2); | dst2 = __msa_adds_s_h(in2, dst2); | ||||
dst2 = __msa_srari_h(dst2, 7); | dst2 = __msa_srari_h(dst2, 7); | ||||
dst2 = CLIP_SH_0_255(dst2); | |||||
CLIP_SH_0_255(dst2); | |||||
PCKEV_B2_SH(dst1, dst0, dst2, dst2, dst0, dst1); | PCKEV_B2_SH(dst1, dst0, dst2, dst2, dst0, dst1); | ||||
tmp2 = __msa_copy_s_d((v2i64) dst0, 0); | tmp2 = __msa_copy_s_d((v2i64) dst0, 0); | ||||
@@ -888,7 +888,7 @@ static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr, | |||||
HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); | HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); | ||||
dst2 = __msa_adds_s_h(dst2, in2); | dst2 = __msa_adds_s_h(dst2, in2); | ||||
dst2 = __msa_srari_h(dst2, 7); | dst2 = __msa_srari_h(dst2, 7); | ||||
dst2 = CLIP_SH_0_255(dst2); | |||||
CLIP_SH_0_255(dst2); | |||||
PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1); | PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1); | ||||
dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0); | dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0); | ||||
@@ -1726,7 +1726,7 @@ static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr, | |||||
ADDS_SH2_SH(out0, in0, out1, in1, out0, out1); | ADDS_SH2_SH(out0, in0, out1, in1, out0, out1); | ||||
ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1); | ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1); | ||||
SRARI_H2_SH(out0, out1, 7); | SRARI_H2_SH(out0, out1, 7); | ||||
CLIP_SH2_0_255_MAX_SATU(out0, out1); | |||||
CLIP_SH2_0_255(out0, out1); | |||||
out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0); | out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0); | ||||
ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ||||
dst += (4 * dst_stride); | dst += (4 * dst_stride); | ||||
@@ -1854,7 +1854,7 @@ static void hevc_hv_bi_8t_8multx1mult_msa(uint8_t *src0_ptr, | |||||
tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | ||||
ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp); | ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp); | ||||
tmp = __msa_srari_h(tmp, 7); | tmp = __msa_srari_h(tmp, 7); | ||||
tmp = CLIP_SH_0_255_MAX_SATU(tmp); | |||||
CLIP_SH_0_255(tmp); | |||||
out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); | out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); | ||||
ST_D1(out, 0, dst_tmp); | ST_D1(out, 0, dst_tmp); | ||||
dst_tmp += dst_stride; | dst_tmp += dst_stride; | ||||
@@ -2000,7 +2000,7 @@ static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr, | |||||
tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | ||||
ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp); | ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp); | ||||
tmp = __msa_srari_h(tmp, 7); | tmp = __msa_srari_h(tmp, 7); | ||||
tmp = CLIP_SH_0_255_MAX_SATU(tmp); | |||||
CLIP_SH_0_255(tmp); | |||||
out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); | out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); | ||||
ST_D1(out, 0, dst_tmp); | ST_D1(out, 0, dst_tmp); | ||||
dst_tmp += dst_stride; | dst_tmp += dst_stride; | ||||
@@ -2088,7 +2088,7 @@ static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr, | |||||
ADDS_SH2_SH(out0, in0, out1, in1, out0, out1); | ADDS_SH2_SH(out0, in0, out1, in1, out0, out1); | ||||
ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1); | ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1); | ||||
SRARI_H2_SH(out0, out1, 7); | SRARI_H2_SH(out0, out1, 7); | ||||
CLIP_SH2_0_255_MAX_SATU(out0, out1); | |||||
CLIP_SH2_0_255(out0, out1); | |||||
out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0); | out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0); | ||||
ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ||||
dst += (4 * dst_stride); | dst += (4 * dst_stride); | ||||
@@ -2215,7 +2215,7 @@ static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr, | |||||
tmp0 = __msa_adds_s_h(tmp0, in0); | tmp0 = __msa_adds_s_h(tmp0, in0); | ||||
tmp0 = __msa_srari_h(tmp0, 7); | tmp0 = __msa_srari_h(tmp0, 7); | ||||
tmp0 = CLIP_SH_0_255(tmp0); | |||||
CLIP_SH_0_255(tmp0); | |||||
dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0); | dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0); | ||||
ST_W2(dst0, 0, 1, dst, dst_stride); | ST_W2(dst0, 0, 1, dst, dst_stride); | ||||
@@ -2943,7 +2943,7 @@ static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr, | |||||
DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); | DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10); | ||||
dst10 = __msa_adds_s_h(dst10, in0); | dst10 = __msa_adds_s_h(dst10, in0); | ||||
dst10 = __msa_srari_h(dst10, 7); | dst10 = __msa_srari_h(dst10, 7); | ||||
dst10 = CLIP_SH_0_255(dst10); | |||||
CLIP_SH_0_255(dst10); | |||||
dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10); | dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10); | ||||
ST_W2(dst10, 0, 1, dst, dst_stride); | ST_W2(dst10, 0, 1, dst, dst_stride); | ||||
@@ -3843,7 +3843,7 @@ static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr, | |||||
tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0); | tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0); | ||||
tmp = __msa_adds_s_h(tmp, in0); | tmp = __msa_adds_s_h(tmp, in0); | ||||
tmp = __msa_srari_h(tmp, 7); | tmp = __msa_srari_h(tmp, 7); | ||||
tmp = CLIP_SH_0_255_MAX_SATU(tmp); | |||||
CLIP_SH_0_255(tmp); | |||||
out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); | out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); | ||||
ST_W2(out, 0, 1, dst, dst_stride); | ST_W2(out, 0, 1, dst, dst_stride); | ||||
} | } | ||||
@@ -3919,7 +3919,7 @@ static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr, | |||||
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); | PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); | ||||
ADDS_SH2_SH(tmp0, in0, tmp1, in1, tmp0, tmp1); | ADDS_SH2_SH(tmp0, in0, tmp1, in1, tmp0, tmp1); | ||||
SRARI_H2_SH(tmp0, tmp1, 7); | SRARI_H2_SH(tmp0, tmp1, 7); | ||||
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1); | |||||
CLIP_SH2_0_255(tmp0, tmp1); | |||||
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | ||||
ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ||||
} | } | ||||
@@ -4032,7 +4032,7 @@ static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr, | |||||
ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, | ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, | ||||
tmp2, tmp3); | tmp2, tmp3); | ||||
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); | SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ||||
dst += (8 * dst_stride); | dst += (8 * dst_stride); | ||||
@@ -4200,7 +4200,7 @@ static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr, | |||||
ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2, | ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2, | ||||
tmp3); | tmp3); | ||||
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); | SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ||||
@@ -4212,7 +4212,7 @@ static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr, | |||||
ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5); | ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5); | ||||
ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5); | ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5); | ||||
SRARI_H2_SH(tmp4, tmp5, 7); | SRARI_H2_SH(tmp4, tmp5, 7); | ||||
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5); | |||||
CLIP_SH2_0_255(tmp4, tmp5); | |||||
out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); | out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); | ||||
ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride); | ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride); | ||||
} | } | ||||
@@ -4286,7 +4286,7 @@ static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr, | |||||
PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1); | PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1); | ||||
ADDS_SH2_SH(in0, tmp0, in1, tmp1, tmp0, tmp1); | ADDS_SH2_SH(in0, tmp0, in1, tmp1, tmp0, tmp1); | ||||
SRARI_H2_SH(tmp0, tmp1, 7); | SRARI_H2_SH(tmp0, tmp1, 7); | ||||
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1); | |||||
CLIP_SH2_0_255(tmp0, tmp1); | |||||
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | ||||
ST_D2(out, 0, 1, dst, dst_stride); | ST_D2(out, 0, 1, dst, dst_stride); | ||||
} | } | ||||
@@ -4380,7 +4380,7 @@ static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr, | |||||
ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, | ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, | ||||
tmp0, tmp1, tmp2, tmp3); | tmp0, tmp1, tmp2, tmp3); | ||||
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); | SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); | ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); | ||||
dst += 8; | dst += 8; | ||||
@@ -4495,8 +4495,8 @@ static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr, | |||||
ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5); | ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5); | ||||
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); | SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); | ||||
SRARI_H2_SH(tmp4, tmp5, 7); | SRARI_H2_SH(tmp4, tmp5, 7); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH2_0_255(tmp4, tmp5); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); | out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); | ||||
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); | ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); | ||||
@@ -4610,7 +4610,7 @@ static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr, | |||||
ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, | ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, | ||||
tmp0, tmp1, tmp2, tmp3); | tmp0, tmp1, tmp2, tmp3); | ||||
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); | SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); | ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); | ||||
dst_tmp += (4 * dst_stride); | dst_tmp += (4 * dst_stride); | ||||
@@ -4760,7 +4760,7 @@ static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr, | |||||
ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, | ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, | ||||
tmp0, tmp1, tmp2, tmp3); | tmp0, tmp1, tmp2, tmp3); | ||||
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); | SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); | ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); | ||||
dst_tmp += (4 * dst_stride); | dst_tmp += (4 * dst_stride); | ||||
@@ -4846,7 +4846,7 @@ static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr, | |||||
ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, | ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, | ||||
tmp0, tmp1, tmp2, tmp3); | tmp0, tmp1, tmp2, tmp3); | ||||
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); | SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ||||
dst += (8 * dst_stride); | dst += (8 * dst_stride); | ||||
@@ -66,7 +66,7 @@ static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = { | |||||
out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt); \ | out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt); \ | ||||
SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \ | SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \ | ||||
PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1); \ | PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1); \ | ||||
CLIP_SH2_0_255_MAX_SATU(out0, out1); \ | |||||
CLIP_SH2_0_255(out0, out1); \ | |||||
} | } | ||||
#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \ | #define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \ | ||||
@@ -124,7 +124,7 @@ static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr, | |||||
dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec); | dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec); | ||||
SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); | SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); | ||||
dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | ||||
dst0 = CLIP_SH_0_255_MAX_SATU(dst0); | |||||
CLIP_SH_0_255(dst0); | |||||
out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); | out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); | ||||
ST_W2(out0, 0, 1, dst, dst_stride); | ST_W2(out0, 0, 1, dst, dst_stride); | ||||
} else if (4 == height) { | } else if (4 == height) { | ||||
@@ -1069,8 +1069,8 @@ static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr, | |||||
dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, | dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, | ||||
(v8i16) weight_vec); | (v8i16) weight_vec); | ||||
SRAR_W2_SW(dst2_r, dst2_l, rnd_vec); | SRAR_W2_SW(dst2_r, dst2_l, rnd_vec); | ||||
dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r); | |||||
out2 = CLIP_SH_0_255(dst2_r); | |||||
out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r); | |||||
CLIP_SH_0_255(out2); | |||||
LD_SB2(src0_ptr, 16, src0, src1); | LD_SB2(src0_ptr, 16, src0, src1); | ||||
src0_ptr += src_stride; | src0_ptr += src_stride; | ||||
@@ -1100,8 +1100,8 @@ static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr, | |||||
dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec); | dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec); | ||||
dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec); | dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec); | ||||
SRAR_W2_SW(dst2_r, dst2_l, rnd_vec); | SRAR_W2_SW(dst2_r, dst2_l, rnd_vec); | ||||
dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r); | |||||
out2 = CLIP_SH_0_255(dst2_r); | |||||
out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r); | |||||
CLIP_SH_0_255(out2); | |||||
PCKEV_B2_SH(out1, out0, out2, out2, out0, out2); | PCKEV_B2_SH(out1, out0, out2, out2, out0, out2); | ||||
dst_val0 = __msa_copy_u_d((v2i64) out2, 0); | dst_val0 = __msa_copy_u_d((v2i64) out2, 0); | ||||
ST_SH(out0, dst); | ST_SH(out0, dst); | ||||
@@ -1674,8 +1674,8 @@ static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr, | |||||
dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, | dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, | ||||
(v8i16) weight_vec); | (v8i16) weight_vec); | ||||
SRAR_W2_SW(dst2_r, dst2_l, rnd_vec); | SRAR_W2_SW(dst2_r, dst2_l, rnd_vec); | ||||
dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r); | |||||
out2 = CLIP_SH_0_255(dst2_r); | |||||
out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r); | |||||
CLIP_SH_0_255(out2); | |||||
PCKEV_B2_SH(out1, out0, out2, out2, out0, out2); | PCKEV_B2_SH(out1, out0, out2, out2, out0, out2); | ||||
ST_D2(out0, 0, 1, dst, dst_stride); | ST_D2(out0, 0, 1, dst, dst_stride); | ||||
ST_W2(out2, 0, 1, dst + 8, dst_stride); | ST_W2(out2, 0, 1, dst + 8, dst_stride); | ||||
@@ -2048,7 +2048,7 @@ static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr, | |||||
dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); | dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); | ||||
dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); | dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); | ||||
SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); | SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); | ||||
CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3); | |||||
CLIP_SW4_0_255(dst0, dst1, dst2, dst3); | |||||
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); | PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); | ||||
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | ||||
ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ||||
@@ -2226,7 +2226,7 @@ static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr, | |||||
dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); | dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); | ||||
dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); | dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); | ||||
SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec); | SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec); | ||||
CLIP_SW4_0_255_MAX_SATU(dst0_l, dst0_r, dst1_l, dst1_r); | |||||
CLIP_SW4_0_255(dst0_l, dst0_r, dst1_l, dst1_r); | |||||
PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1); | PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1); | ||||
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | ||||
ST_D2(out, 0, 1, dst_tmp, dst_stride); | ST_D2(out, 0, 1, dst_tmp, dst_stride); | ||||
@@ -2412,7 +2412,7 @@ static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr, | |||||
dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); | dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); | ||||
dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); | dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); | ||||
SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec); | SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec); | ||||
CLIP_SW4_0_255_MAX_SATU(dst1, dst0, dst3, dst2); | |||||
CLIP_SW4_0_255(dst1, dst0, dst3, dst2); | |||||
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); | PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); | ||||
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | ||||
ST_D2(out, 0, 1, dst_tmp, dst_stride); | ST_D2(out, 0, 1, dst_tmp, dst_stride); | ||||
@@ -2503,7 +2503,7 @@ static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr, | |||||
dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); | dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec); | ||||
dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); | dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); | ||||
SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); | SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); | ||||
CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3); | |||||
CLIP_SW4_0_255(dst0, dst1, dst2, dst3); | |||||
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); | PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); | ||||
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | ||||
ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ||||
@@ -2683,8 +2683,8 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr, | |||||
dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec); | dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec); | ||||
dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec); | dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec); | ||||
SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); | SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); | ||||
dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | |||||
out0 = CLIP_SH_0_255(dst0_r); | |||||
out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | |||||
CLIP_SH_0_255(out0); | |||||
out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0); | out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0); | ||||
ST_W2(out0, 0, 1, dst, dst_stride); | ST_W2(out0, 0, 1, dst, dst_stride); | ||||
} | } | ||||
@@ -3554,8 +3554,8 @@ static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr, | |||||
dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec); | dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec); | ||||
dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec); | dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec); | ||||
SRAR_W2_SW(dst10_r, dst10_l, rnd_vec); | SRAR_W2_SW(dst10_r, dst10_l, rnd_vec); | ||||
dst10_r = (v4i32) __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r); | |||||
out = CLIP_SH_0_255(dst10_r); | |||||
out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r); | |||||
CLIP_SH_0_255(out); | |||||
out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out); | out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out); | ||||
ST_W2(out, 0, 1, dst, dst_stride); | ST_W2(out, 0, 1, dst, dst_stride); | ||||
} | } | ||||
@@ -4575,7 +4575,7 @@ static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr, | |||||
dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); | dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec); | ||||
SRAR_W2_SW(dst0, dst1, rnd_vec); | SRAR_W2_SW(dst0, dst1, rnd_vec); | ||||
tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0); | tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0); | ||||
tmp = CLIP_SH_0_255_MAX_SATU(tmp); | |||||
CLIP_SH_0_255(tmp); | |||||
out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); | out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); | ||||
ST_W2(out, 0, 1, dst, dst_stride); | ST_W2(out, 0, 1, dst, dst_stride); | ||||
} | } | ||||
@@ -4672,7 +4672,7 @@ static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr, | |||||
dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); | dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); | ||||
SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); | SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); | ||||
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); | PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); | ||||
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1); | |||||
CLIP_SH2_0_255(tmp0, tmp1); | |||||
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | ||||
ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ||||
} | } | ||||
@@ -4810,7 +4810,7 @@ static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr, | |||||
SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); | SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); | ||||
PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1, | PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1, | ||||
tmp2, tmp3); | tmp2, tmp3); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ||||
dst += (8 * dst_stride); | dst += (8 * dst_stride); | ||||
@@ -5008,7 +5008,7 @@ static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr, | |||||
SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); | SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); | ||||
PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1, | PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1, | ||||
tmp2, tmp3); | tmp2, tmp3); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ||||
@@ -5030,7 +5030,7 @@ static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr, | |||||
SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); | SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); | ||||
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5); | PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5); | ||||
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5); | |||||
CLIP_SH2_0_255(tmp4, tmp5); | |||||
out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); | out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); | ||||
ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride); | ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride); | ||||
} | } | ||||
@@ -5126,7 +5126,7 @@ static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr, | |||||
dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); | dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); | ||||
SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec); | SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec); | ||||
PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1); | PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1); | ||||
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1); | |||||
CLIP_SH2_0_255(tmp0, tmp1); | |||||
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | ||||
ST_D2(out, 0, 1, dst, dst_stride); | ST_D2(out, 0, 1, dst, dst_stride); | ||||
} | } | ||||
@@ -5248,7 +5248,7 @@ static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr, | |||||
SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); | SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); | ||||
PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, | PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, | ||||
tmp0, tmp1, tmp2, tmp3); | tmp0, tmp1, tmp2, tmp3); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); | ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); | ||||
dst += 8; | dst += 8; | ||||
@@ -5387,7 +5387,7 @@ static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr, | |||||
SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); | SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); | ||||
PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, | PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, | ||||
tmp0, tmp1, tmp2, tmp3); | tmp0, tmp1, tmp2, tmp3); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1); | PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1); | ||||
@@ -5399,7 +5399,7 @@ static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr, | |||||
dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); | dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec); | ||||
SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); | SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); | ||||
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5); | PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5); | ||||
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5); | |||||
CLIP_SH2_0_255(tmp4, tmp5); | |||||
out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); | out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); | ||||
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); | ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); | ||||
ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); | ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); | ||||
@@ -5537,7 +5537,7 @@ static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr, | |||||
SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); | SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); | ||||
PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, | PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, | ||||
tmp0, tmp1, tmp2, tmp3); | tmp0, tmp1, tmp2, tmp3); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); | ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); | ||||
dst_tmp += (4 * dst_stride); | dst_tmp += (4 * dst_stride); | ||||
@@ -5724,7 +5724,7 @@ static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr, | |||||
SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); | SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); | ||||
PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, | PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, | ||||
tmp0, tmp1, tmp2, tmp3); | tmp0, tmp1, tmp2, tmp3); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); | ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); | ||||
dst_tmp += (4 * dst_stride); | dst_tmp += (4 * dst_stride); | ||||
@@ -5820,7 +5820,7 @@ static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr, | |||||
SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); | SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec); | ||||
PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, | PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, | ||||
tmp0, tmp1, tmp2, tmp3); | tmp0, tmp1, tmp2, tmp3); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ||||
dst += (8 * dst_stride); | dst += (8 * dst_stride); | ||||
@@ -41,7 +41,7 @@ static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = { | |||||
SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w); \ | SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w); \ | ||||
PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h); \ | PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h); \ | ||||
ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h); \ | ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h); \ | ||||
CLIP_SH2_0_255_MAX_SATU(out0_h, out1_h); \ | |||||
CLIP_SH2_0_255(out0_h, out1_h); \ | |||||
} | } | ||||
#define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w, \ | #define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w, \ | ||||
@@ -88,7 +88,7 @@ static void hevc_uniwgt_copy_4w_msa(uint8_t *src, | |||||
SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); | SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); | ||||
dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | ||||
dst0 += offset_vec; | dst0 += offset_vec; | ||||
dst0 = CLIP_SH_0_255_MAX_SATU(dst0); | |||||
CLIP_SH_0_255(dst0); | |||||
out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); | out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); | ||||
ST_W2(out0, 0, 1, dst, dst_stride); | ST_W2(out0, 0, 1, dst, dst_stride); | ||||
} else if (4 == height) { | } else if (4 == height) { | ||||
@@ -1863,7 +1863,7 @@ static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src, | |||||
SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec); | SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec); | ||||
ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r); | ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r); | ||||
ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r); | ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r); | ||||
CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r); | |||||
CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r); | |||||
PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); | PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); | ||||
out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r); | out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r); | ||||
ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ||||
@@ -2014,7 +2014,7 @@ static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src, | |||||
SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec); | SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec); | ||||
ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l); | ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l); | ||||
ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l); | ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l); | ||||
CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst0_l, dst1_l); | |||||
CLIP_SW4_0_255(dst0_r, dst1_r, dst0_l, dst1_l); | |||||
PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); | PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); | ||||
dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r); | dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r); | ||||
@@ -2165,7 +2165,7 @@ static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src, | |||||
MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l); | MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l); | ||||
SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); | SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); | ||||
ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l); | ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l); | ||||
CLIP_SW2_0_255_MAX_SATU(dst0_r, dst0_l); | |||||
CLIP_SW2_0_255(dst0_r, dst0_l); | |||||
dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | ||||
out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r); | out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r); | ||||
ST_D1(out, 0, dst_tmp); | ST_D1(out, 0, dst_tmp); | ||||
@@ -2246,7 +2246,7 @@ static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src, | |||||
SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec); | SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec); | ||||
ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r); | ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r); | ||||
ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r); | ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r); | ||||
CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r); | |||||
CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r); | |||||
PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); | PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); | ||||
out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r); | out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r); | ||||
ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ||||
@@ -2394,7 +2394,7 @@ static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src, | |||||
SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); | SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); | ||||
dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | ||||
dst0 = __msa_adds_s_h(dst0, offset_vec); | dst0 = __msa_adds_s_h(dst0, offset_vec); | ||||
dst0 = CLIP_SH_0_255_MAX_SATU(dst0); | |||||
CLIP_SH_0_255(dst0); | |||||
out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); | out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); | ||||
ST_W2(out, 0, 1, dst, dst_stride); | ST_W2(out, 0, 1, dst, dst_stride); | ||||
dst += (4 * dst_stride); | dst += (4 * dst_stride); | ||||
@@ -3295,7 +3295,7 @@ static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src, | |||||
SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); | SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); | ||||
dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); | ||||
dst0 = __msa_adds_s_h(dst0, offset_vec); | dst0 = __msa_adds_s_h(dst0, offset_vec); | ||||
dst0 = CLIP_SH_0_255_MAX_SATU(dst0); | |||||
CLIP_SH_0_255(dst0); | |||||
out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); | out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); | ||||
ST_W2(out, 0, 1, dst, dst_stride); | ST_W2(out, 0, 1, dst, dst_stride); | ||||
} | } | ||||
@@ -4247,7 +4247,7 @@ static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src, | |||||
SRAR_W2_SW(dst0, dst1, rnd_vec); | SRAR_W2_SW(dst0, dst1, rnd_vec); | ||||
tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0); | tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0); | ||||
tmp += offset_vec; | tmp += offset_vec; | ||||
tmp = CLIP_SH_0_255_MAX_SATU(tmp); | |||||
CLIP_SH_0_255(tmp); | |||||
out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); | out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); | ||||
ST_W2(out, 0, 1, dst, dst_stride); | ST_W2(out, 0, 1, dst, dst_stride); | ||||
} | } | ||||
@@ -4316,7 +4316,7 @@ static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src, | |||||
SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); | SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec); | ||||
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); | PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); | ||||
ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ||||
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1); | |||||
CLIP_SH2_0_255(tmp0, tmp1); | |||||
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | ||||
ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ST_W4(out, 0, 1, 2, 3, dst, dst_stride); | ||||
} | } | ||||
@@ -4417,7 +4417,7 @@ static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src, | |||||
tmp2, tmp3); | tmp2, tmp3); | ||||
ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ||||
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); | ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ||||
dst += (8 * dst_stride); | dst += (8 * dst_stride); | ||||
@@ -4574,8 +4574,8 @@ static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src, | |||||
ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ||||
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); | ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); | ||||
ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5); | ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH2_0_255(tmp4, tmp5); | |||||
PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2); | PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2); | ||||
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ||||
ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride); | ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride); | ||||
@@ -4652,7 +4652,7 @@ static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src, | |||||
SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec); | SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec); | ||||
PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1); | PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1); | ||||
ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ||||
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1); | |||||
CLIP_SH2_0_255(tmp0, tmp1); | |||||
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); | ||||
ST_D2(out, 0, 1, dst, dst_stride); | ST_D2(out, 0, 1, dst, dst_stride); | ||||
} | } | ||||
@@ -4745,7 +4745,7 @@ static void hevc_hv_uniwgt_4t_8multx4_msa(uint8_t *src, | |||||
dst3_r, tmp0, tmp1, tmp2, tmp3); | dst3_r, tmp0, tmp1, tmp2, tmp3); | ||||
ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ||||
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); | ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); | ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); | ||||
dst += 8; | dst += 8; | ||||
@@ -4861,8 +4861,8 @@ static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src, | |||||
ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ||||
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); | ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); | ||||
ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5); | ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH2_0_255(tmp4, tmp5); | |||||
PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2); | PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2); | ||||
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); | ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); | ||||
ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); | ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); | ||||
@@ -4973,7 +4973,7 @@ static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src, | |||||
dst3_r, tmp0, tmp1, tmp2, tmp3); | dst3_r, tmp0, tmp1, tmp2, tmp3); | ||||
ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ||||
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); | ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); | ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); | ||||
dst_tmp += (4 * dst_stride); | dst_tmp += (4 * dst_stride); | ||||
@@ -5120,7 +5120,7 @@ static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src, | |||||
dst3_r, tmp0, tmp1, tmp2, tmp3); | dst3_r, tmp0, tmp1, tmp2, tmp3); | ||||
ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ||||
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); | ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); | ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); | ||||
dst_tmp += (4 * dst_stride); | dst_tmp += (4 * dst_stride); | ||||
@@ -5187,7 +5187,7 @@ static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src, | |||||
tmp2, tmp3); | tmp2, tmp3); | ||||
ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1); | ||||
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); | ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3); | ||||
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); | |||||
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); | |||||
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | ||||
ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); | ||||
dst += (8 * dst_stride); | dst += (8 * dst_stride); | ||||
@@ -83,7 +83,7 @@ static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top, | |||||
vec2 -= vec0; | vec2 -= vec0; | ||||
vec2 >>= 1; | vec2 >>= 1; | ||||
vec2 += vec1; | vec2 += vec1; | ||||
vec2 = CLIP_SH_0_255(vec2); | |||||
CLIP_SH_0_255(vec2); | |||||
for (col = 0; col < 4; col++) { | for (col = 0; col < 4; col++) { | ||||
dst[stride * col] = (uint8_t) vec2[col]; | dst[stride * col] = (uint8_t) vec2[col]; | ||||
@@ -122,7 +122,7 @@ static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top, | |||||
vec2 -= vec0; | vec2 -= vec0; | ||||
vec2 >>= 1; | vec2 >>= 1; | ||||
vec2 += vec1; | vec2 += vec1; | ||||
vec2 = CLIP_SH_0_255(vec2); | |||||
CLIP_SH_0_255(vec2); | |||||
val0 = vec2[0]; | val0 = vec2[0]; | ||||
val1 = vec2[1]; | val1 = vec2[1]; | ||||
@@ -214,7 +214,7 @@ static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top, | |||||
src0_r -= src_top_val; | src0_r -= src_top_val; | ||||
src0_r >>= 1; | src0_r >>= 1; | ||||
src0_r += src_left_val; | src0_r += src_left_val; | ||||
src0_r = CLIP_SH_0_255(src0_r); | |||||
CLIP_SH_0_255(src0_r); | |||||
src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r); | src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r); | ||||
val0 = __msa_copy_s_w((v4i32) src0, 0); | val0 = __msa_copy_s_w((v4i32) src0, 0); | ||||
SW(val0, dst); | SW(val0, dst); | ||||
@@ -254,7 +254,7 @@ static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top, | |||||
src0_r -= src_top_val; | src0_r -= src_top_val; | ||||
src0_r >>= 1; | src0_r >>= 1; | ||||
src0_r += src_left_val; | src0_r += src_left_val; | ||||
src0_r = CLIP_SH_0_255(src0_r); | |||||
CLIP_SH_0_255(src0_r); | |||||
src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r); | src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r); | ||||
val0 = __msa_copy_s_d((v2i64) src0, 0); | val0 = __msa_copy_s_d((v2i64) src0, 0); | ||||
SD(val0, dst); | SD(val0, dst); | ||||
@@ -28,8 +28,7 @@ static void put_pixels_clamped_msa(const int16_t *block, uint8_t *pixels, | |||||
v8i16 in0, in1, in2, in3, in4, in5, in6, in7; | v8i16 in0, in1, in2, in3, in4, in5, in6, in7; | ||||
LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7); | LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7); | ||||
CLIP_SH4_0_255(in0, in1, in2, in3); | |||||
CLIP_SH4_0_255(in4, in5, in6, in7); | |||||
CLIP_SH8_0_255(in0, in1, in2, in3, in4, in5, in6, in7); | |||||
PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3); | PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3); | ||||
PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7); | PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7); | ||||
@@ -63,8 +62,7 @@ static void put_signed_pixels_clamped_msa(const int16_t *block, uint8_t *pixels, | |||||
in6 += 128; | in6 += 128; | ||||
in7 += 128; | in7 += 128; | ||||
CLIP_SH4_0_255(in0, in1, in2, in3); | |||||
CLIP_SH4_0_255(in4, in5, in6, in7); | |||||
CLIP_SH8_0_255(in0, in1, in2, in3, in4, in5, in6, in7); | |||||
PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3); | PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3); | ||||
PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7); | PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7); | ||||
@@ -109,8 +107,7 @@ static void add_pixels_clamped_msa(const int16_t *block, uint8_t *pixels, | |||||
in6 += (v8i16) pix6; | in6 += (v8i16) pix6; | ||||
in7 += (v8i16) pix7; | in7 += (v8i16) pix7; | ||||
CLIP_SH4_0_255(in0, in1, in2, in3); | |||||
CLIP_SH4_0_255(in4, in5, in6, in7); | |||||
CLIP_SH8_0_255(in0, in1, in2, in3, in4, in5, in6, in7); | |||||
PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3); | PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3); | ||||
PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7); | PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7); | ||||
@@ -96,7 +96,7 @@ | |||||
DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \ | DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \ | ||||
res0_r = (v8i16) (sum0_r - sum3_r); \ | res0_r = (v8i16) (sum0_r - sum3_r); \ | ||||
res0_r = __msa_srari_h(res0_r, 5); \ | res0_r = __msa_srari_h(res0_r, 5); \ | ||||
res0_r = CLIP_SH_0_255(res0_r); \ | |||||
CLIP_SH_0_255(res0_r); \ | |||||
out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \ | out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \ | ||||
\ | \ | ||||
out; \ | out; \ | ||||
@@ -118,7 +118,7 @@ | |||||
res0_r = (v8i16) (sum0_r - sum3_r); \ | res0_r = (v8i16) (sum0_r - sum3_r); \ | ||||
res0_r += 15; \ | res0_r += 15; \ | ||||
res0_r >>= 5; \ | res0_r >>= 5; \ | ||||
res0_r = CLIP_SH_0_255(res0_r); \ | |||||
CLIP_SH_0_255(res0_r); \ | |||||
out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \ | out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \ | ||||
\ | \ | ||||
out; \ | out; \ | ||||
@@ -336,35 +336,26 @@ static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride, | |||||
SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20); | SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20); | ||||
SRA_4V(a3_r, a3_l, a2_r, a2_l, 20); | SRA_4V(a3_r, a3_l, a2_r, a2_l, 20); | ||||
SRA_4V(a1_r, a1_l, a0_r, a0_l, 20); | SRA_4V(a1_r, a1_l, a0_r, a0_l, 20); | ||||
PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r, | |||||
temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r); | |||||
PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r, | |||||
a0_r, a1_r, a2_r, a3_r); | |||||
temp0_r = (v4i32) CLIP_SH_0_255(temp0_r); | |||||
temp1_r = (v4i32) CLIP_SH_0_255(temp1_r); | |||||
temp2_r = (v4i32) CLIP_SH_0_255(temp2_r); | |||||
temp3_r = (v4i32) CLIP_SH_0_255(temp3_r); | |||||
PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r, | |||||
temp2_r, temp2_r, temp3_r, temp3_r, | |||||
temp0_r, temp1_r, temp2_r, temp3_r); | |||||
tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1); | |||||
tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1); | |||||
tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1); | |||||
tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1); | |||||
SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); | |||||
dst += 4 * dst_stride; | |||||
a0_r = (v4i32) CLIP_SH_0_255(a0_r); | |||||
a1_r = (v4i32) CLIP_SH_0_255(a1_r); | |||||
a2_r = (v4i32) CLIP_SH_0_255(a2_r); | |||||
a3_r = (v4i32) CLIP_SH_0_255(a3_r); | |||||
PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r, | |||||
a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r); | |||||
tmp3 = __msa_copy_u_d((v2i64) a0_r, 1); | |||||
tmp2 = __msa_copy_u_d((v2i64) a1_r, 1); | |||||
tmp1 = __msa_copy_u_d((v2i64) a2_r, 1); | |||||
tmp0 = __msa_copy_u_d((v2i64) a3_r, 1); | |||||
PCKEV_H4_SH(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r, | |||||
temp3_l, temp3_r, in0, in1, in2, in3); | |||||
PCKEV_H4_SH(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r, | |||||
in4, in5, in6, in7); | |||||
CLIP_SH4_0_255(in0, in1, in2, in3); | |||||
PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, | |||||
in0, in1, in2, in3); | |||||
tmp0 = __msa_copy_u_d((v2i64) in0, 1); | |||||
tmp1 = __msa_copy_u_d((v2i64) in1, 1); | |||||
tmp2 = __msa_copy_u_d((v2i64) in2, 1); | |||||
tmp3 = __msa_copy_u_d((v2i64) in3, 1); | |||||
SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); | SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); | ||||
dst += 4 * dst_stride; | |||||
CLIP_SH4_0_255(in4, in5, in6, in7); | |||||
PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, | |||||
in4, in5, in6, in7); | |||||
tmp3 = __msa_copy_u_d((v2i64) in4, 1); | |||||
tmp2 = __msa_copy_u_d((v2i64) in5, 1); | |||||
tmp1 = __msa_copy_u_d((v2i64) in6, 1); | |||||
tmp0 = __msa_copy_u_d((v2i64) in7, 1); | |||||
SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride); | |||||
} | } | ||||
static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, | static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, | ||||
@@ -516,21 +507,17 @@ static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, | |||||
temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r); | temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r); | ||||
ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3, | ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3, | ||||
temp0_l, temp1_l, temp2_l, temp3_l); | temp0_l, temp1_l, temp2_l, temp3_l); | ||||
temp0_r = (v4i32) ((v8i16) (temp0_r) + (v8i16) (temp0_l)); | |||||
temp1_r = (v4i32) ((v8i16) (temp1_r) + (v8i16) (temp1_l)); | |||||
temp2_r = (v4i32) ((v8i16) (temp2_r) + (v8i16) (temp2_l)); | |||||
temp3_r = (v4i32) ((v8i16) (temp3_r) + (v8i16) (temp3_l)); | |||||
temp0_r = (v4i32) CLIP_SH_0_255(temp0_r); | |||||
temp1_r = (v4i32) CLIP_SH_0_255(temp1_r); | |||||
temp2_r = (v4i32) CLIP_SH_0_255(temp2_r); | |||||
temp3_r = (v4i32) CLIP_SH_0_255(temp3_r); | |||||
PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r, | |||||
temp2_r, temp2_r, temp3_r, temp3_r, | |||||
temp0_r, temp1_r, temp2_r, temp3_r); | |||||
tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1); | |||||
tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1); | |||||
tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1); | |||||
tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1); | |||||
in0 = (v8i16) (temp0_r) + (v8i16) (temp0_l); | |||||
in1 = (v8i16) (temp1_r) + (v8i16) (temp1_l); | |||||
in2 = (v8i16) (temp2_r) + (v8i16) (temp2_l); | |||||
in3 = (v8i16) (temp3_r) + (v8i16) (temp3_l); | |||||
CLIP_SH4_0_255(in0, in1, in2, in3); | |||||
PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, | |||||
in0, in1, in2, in3); | |||||
tmp0 = __msa_copy_u_d((v2i64) in0, 1); | |||||
tmp1 = __msa_copy_u_d((v2i64) in1, 1); | |||||
tmp2 = __msa_copy_u_d((v2i64) in2, 1); | |||||
tmp3 = __msa_copy_u_d((v2i64) in3, 1); | |||||
SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); | SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); | ||||
SRA_4V(a3_r, a3_l, a2_r, a2_l, 20); | SRA_4V(a3_r, a3_l, a2_r, a2_l, 20); | ||||
@@ -540,20 +527,17 @@ static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, | |||||
a0_r, a1_r, a2_r, a3_r); | a0_r, a1_r, a2_r, a3_r); | ||||
ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7, | ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7, | ||||
a3_l, a2_l, a1_l, a0_l); | a3_l, a2_l, a1_l, a0_l); | ||||
a3_r = (v4i32) ((v8i16) (a3_r) + (v8i16) (a3_l)); | |||||
a2_r = (v4i32) ((v8i16) (a2_r) + (v8i16) (a2_l)); | |||||
a1_r = (v4i32) ((v8i16) (a1_r) + (v8i16) (a1_l)); | |||||
a0_r = (v4i32) ((v8i16) (a0_r) + (v8i16) (a0_l)); | |||||
a3_r = (v4i32) CLIP_SH_0_255(a3_r); | |||||
a2_r = (v4i32) CLIP_SH_0_255(a2_r); | |||||
a1_r = (v4i32) CLIP_SH_0_255(a1_r); | |||||
a0_r = (v4i32) CLIP_SH_0_255(a0_r); | |||||
PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r, | |||||
a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r); | |||||
tmp0 = __msa_copy_u_d((v2i64) a3_r, 1); | |||||
tmp1 = __msa_copy_u_d((v2i64) a2_r, 1); | |||||
tmp2 = __msa_copy_u_d((v2i64) a1_r, 1); | |||||
tmp3 = __msa_copy_u_d((v2i64) a0_r, 1); | |||||
in4 = (v8i16) (a3_r) + (v8i16) (a3_l); | |||||
in5 = (v8i16) (a2_r) + (v8i16) (a2_l); | |||||
in6 = (v8i16) (a1_r) + (v8i16) (a1_l); | |||||
in7 = (v8i16) (a0_r) + (v8i16) (a0_l); | |||||
CLIP_SH4_0_255(in4, in5, in6, in7); | |||||
PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, | |||||
in4, in5, in6, in7); | |||||
tmp0 = __msa_copy_u_d((v2i64) in4, 1); | |||||
tmp1 = __msa_copy_u_d((v2i64) in5, 1); | |||||
tmp2 = __msa_copy_u_d((v2i64) in6, 1); | |||||
tmp3 = __msa_copy_u_d((v2i64) in7, 1); | |||||
SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride); | SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride); | ||||
} | } | ||||
@@ -187,14 +187,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type) | |||||
G += c5; | G += c5; | ||||
H += c6; | H += c6; | ||||
} | } | ||||
A = CLIP_SW_0_255(A); | |||||
B = CLIP_SW_0_255(B); | |||||
C = CLIP_SW_0_255(C); | |||||
D = CLIP_SW_0_255(D); | |||||
E = CLIP_SW_0_255(E); | |||||
F = CLIP_SW_0_255(F); | |||||
G = CLIP_SW_0_255(G); | |||||
H = CLIP_SW_0_255(H); | |||||
CLIP_SW8_0_255(A, B, C, D, E, F, G, H); | |||||
sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r); | sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r); | ||||
sign_l = __msa_or_v(sign_l, (v16u8)r3_r); | sign_l = __msa_or_v(sign_l, (v16u8)r3_r); | ||||
sign_l = __msa_or_v(sign_l, (v16u8)r0_l); | sign_l = __msa_or_v(sign_l, (v16u8)r0_l); | ||||
@@ -205,7 +198,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type) | |||||
Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20; | Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20; | ||||
if (type == 1) { | if (type == 1) { | ||||
Bdd = Add + cnst128w; | Bdd = Add + cnst128w; | ||||
Bdd = CLIP_SW_0_255(Bdd); | |||||
CLIP_SW_0_255(Bdd); | |||||
Ad = Bdd; | Ad = Bdd; | ||||
Bd = Bdd; | Bd = Bdd; | ||||
Cd = Bdd; | Cd = Bdd; | ||||
@@ -223,14 +216,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type) | |||||
Fd = Add + c5; | Fd = Add + c5; | ||||
Gd = Add + c6; | Gd = Add + c6; | ||||
Hd = Add + c7; | Hd = Add + c7; | ||||
Ad = CLIP_SW_0_255(Ad); | |||||
Bd = CLIP_SW_0_255(Bd); | |||||
Cd = CLIP_SW_0_255(Cd); | |||||
Dd = CLIP_SW_0_255(Dd); | |||||
Ed = CLIP_SW_0_255(Ed); | |||||
Fd = CLIP_SW_0_255(Fd); | |||||
Gd = CLIP_SW_0_255(Gd); | |||||
Hd = CLIP_SW_0_255(Hd); | |||||
CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd); | |||||
} | } | ||||
Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t); | Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t); | ||||
Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t); | Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t); | ||||
@@ -309,14 +295,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type) | |||||
G += c5; | G += c5; | ||||
H += c6; | H += c6; | ||||
} | } | ||||
A = CLIP_SW_0_255(A); | |||||
B = CLIP_SW_0_255(B); | |||||
C = CLIP_SW_0_255(C); | |||||
D = CLIP_SW_0_255(D); | |||||
E = CLIP_SW_0_255(E); | |||||
F = CLIP_SW_0_255(F); | |||||
G = CLIP_SW_0_255(G); | |||||
H = CLIP_SW_0_255(H); | |||||
CLIP_SW8_0_255(A, B, C, D, E, F, G, H); | |||||
sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r); | sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r); | ||||
sign_l = __msa_or_v(sign_l, (v16u8)r7_r); | sign_l = __msa_or_v(sign_l, (v16u8)r7_r); | ||||
sign_l = __msa_or_v(sign_l, (v16u8)r4_l); | sign_l = __msa_or_v(sign_l, (v16u8)r4_l); | ||||
@@ -327,7 +306,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type) | |||||
Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20; | Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20; | ||||
if (type == 1) { | if (type == 1) { | ||||
Bdd = Add + cnst128w; | Bdd = Add + cnst128w; | ||||
Bdd = CLIP_SW_0_255(Bdd); | |||||
CLIP_SW_0_255(Bdd); | |||||
Ad = Bdd; | Ad = Bdd; | ||||
Bd = Bdd; | Bd = Bdd; | ||||
Cd = Bdd; | Cd = Bdd; | ||||
@@ -345,14 +324,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type) | |||||
Fd = Add + c5; | Fd = Add + c5; | ||||
Gd = Add + c6; | Gd = Add + c6; | ||||
Hd = Add + c7; | Hd = Add + c7; | ||||
Ad = CLIP_SW_0_255(Ad); | |||||
Bd = CLIP_SW_0_255(Bd); | |||||
Cd = CLIP_SW_0_255(Cd); | |||||
Dd = CLIP_SW_0_255(Dd); | |||||
Ed = CLIP_SW_0_255(Ed); | |||||
Fd = CLIP_SW_0_255(Fd); | |||||
Gd = CLIP_SW_0_255(Gd); | |||||
Hd = CLIP_SW_0_255(Hd); | |||||
CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd); | |||||
} | } | ||||
Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t); | Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t); | ||||
Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t); | Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t); | ||||
@@ -436,14 +408,7 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block) | |||||
e5 += dc; | e5 += dc; | ||||
e6 += dc; | e6 += dc; | ||||
e7 += dc; | e7 += dc; | ||||
e0 = CLIP_SW_0_255(e0); | |||||
e1 = CLIP_SW_0_255(e1); | |||||
e2 = CLIP_SW_0_255(e2); | |||||
e3 = CLIP_SW_0_255(e3); | |||||
e4 = CLIP_SW_0_255(e4); | |||||
e5 = CLIP_SW_0_255(e5); | |||||
e6 = CLIP_SW_0_255(e6); | |||||
e7 = CLIP_SW_0_255(e7); | |||||
CLIP_SW8_0_255(e0, e1, e2, e3, e4, e5, e6, e7); | |||||
/* Left part */ | /* Left part */ | ||||
ILVL_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3, | ILVL_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3, | ||||
@@ -458,14 +423,7 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block) | |||||
r5 += dc; | r5 += dc; | ||||
r6 += dc; | r6 += dc; | ||||
r7 += dc; | r7 += dc; | ||||
r0 = CLIP_SW_0_255(r0); | |||||
r1 = CLIP_SW_0_255(r1); | |||||
r2 = CLIP_SW_0_255(r2); | |||||
r3 = CLIP_SW_0_255(r3); | |||||
r4 = CLIP_SW_0_255(r4); | |||||
r5 = CLIP_SW_0_255(r5); | |||||
r6 = CLIP_SW_0_255(r6); | |||||
r7 = CLIP_SW_0_255(r7); | |||||
CLIP_SW8_0_255(r0, r1, r2, r3, r4, r5, r6, r7); | |||||
VSHF_B2_SB(e0, r0, e1, r1, mask, mask, d0, d1); | VSHF_B2_SB(e0, r0, e1, r1, mask, mask, d0, d1); | ||||
VSHF_B2_SB(e2, r2, e3, r3, mask, mask, d2, d3); | VSHF_B2_SB(e2, r2, e3, r3, mask, mask, d2, d3); | ||||
VSHF_B2_SB(e4, r4, e5, r5, mask, mask, d4, d5); | VSHF_B2_SB(e4, r4, e5, r5, mask, mask, d4, d5); | ||||
@@ -516,10 +474,7 @@ void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride, | |||||
f1 += e1; | f1 += e1; | ||||
g0 -= e0; | g0 -= e0; | ||||
g1 -= e1; | g1 -= e1; | ||||
f0 = CLIP_SW_0_255(f0); | |||||
f1 = CLIP_SW_0_255(f1); | |||||
g0 = CLIP_SW_0_255(g0); | |||||
g1 = CLIP_SW_0_255(g1); | |||||
CLIP_SW4_0_255(f0, f1, g0, g1); | |||||
VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2); | VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2); | ||||
/* Final move to first_pixel */ | /* Final move to first_pixel */ | ||||
@@ -563,10 +518,7 @@ void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride, | |||||
f1 += e1; | f1 += e1; | ||||
g0 -= e0; | g0 -= e0; | ||||
g1 -= e1; | g1 -= e1; | ||||
f0 = CLIP_SW_0_255(f0); | |||||
f1 = CLIP_SW_0_255(f1); | |||||
g0 = CLIP_SW_0_255(g0); | |||||
g1 = CLIP_SW_0_255(g1); | |||||
CLIP_SW4_0_255(f0, f1, g0, g1); | |||||
VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2); | VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2); | ||||
/* Final move to first_pixel */ | /* Final move to first_pixel */ | ||||
ST_H4(d1, 0, 1, 2, 3, first_pixel - 1, stride); | ST_H4(d1, 0, 1, 2, 3, first_pixel - 1, stride); | ||||
@@ -71,10 +71,7 @@ void ff_vp8_idct_add_msa(uint8_t *dst, int16_t input[16], ptrdiff_t stride) | |||||
ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, | ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, | ||||
res0, res1, res2, res3); | res0, res1, res2, res3); | ||||
ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); | ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); | ||||
res0 = CLIP_SW_0_255(res0); | |||||
res1 = CLIP_SW_0_255(res1); | |||||
res2 = CLIP_SW_0_255(res2); | |||||
res3 = CLIP_SW_0_255(res3); | |||||
CLIP_SW4_0_255(res0, res1, res2, res3); | |||||
VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1); | VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1); | ||||
ST_W2(dest0, 0, 1, dst, stride); | ST_W2(dest0, 0, 1, dst, stride); | ||||
ST_W2(dest1, 0, 1, dst + 2 * stride, stride); | ST_W2(dest1, 0, 1, dst + 2 * stride, stride); | ||||
@@ -763,13 +763,13 @@ static void vp9_iadst8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst, | |||||
res0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst0); | res0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst0); | ||||
res0 += out0; | res0 += out0; | ||||
res0 = CLIP_SH_0_255(res0); | |||||
CLIP_SH_0_255(res0); | |||||
res0 = (v8i16) __msa_pckev_b((v16i8) res0, (v16i8) res0); | res0 = (v8i16) __msa_pckev_b((v16i8) res0, (v16i8) res0); | ||||
ST_D1(res0, 0, dst); | ST_D1(res0, 0, dst); | ||||
res7 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst7); | res7 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst7); | ||||
res7 += out7; | res7 += out7; | ||||
res7 = CLIP_SH_0_255(res7); | |||||
CLIP_SH_0_255(res7); | |||||
res7 = (v8i16) __msa_pckev_b((v16i8) res7, (v16i8) res7); | res7 = (v8i16) __msa_pckev_b((v16i8) res7, (v16i8) res7); | ||||
ST_D1(res7, 0, dst + 7 * dst_stride); | ST_D1(res7, 0, dst + 7 * dst_stride); | ||||
@@ -1192,8 +1192,7 @@ static void vp9_idct16x16_1_add_msa(int16_t *input, uint8_t *dst, | |||||
res3); | res3); | ||||
ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, | ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, | ||||
res7); | res7); | ||||
CLIP_SH4_0_255(res0, res1, res2, res3); | |||||
CLIP_SH4_0_255(res4, res5, res6, res7); | |||||
CLIP_SH8_0_255(res0, res1, res2, res3, res4, res5, res6, res7); | |||||
PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, | PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, | ||||
tmp0, tmp1, tmp2, tmp3); | tmp0, tmp1, tmp2, tmp3); | ||||
ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); | ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); | ||||
@@ -1981,8 +1980,7 @@ static void vp9_idct32x32_1_add_msa(int16_t *input, uint8_t *dst, | |||||
res3); | res3); | ||||
ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, | ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, | ||||
res7); | res7); | ||||
CLIP_SH4_0_255(res0, res1, res2, res3); | |||||
CLIP_SH4_0_255(res4, res5, res6, res7); | |||||
CLIP_SH8_0_255(res0, res1, res2, res3, res4, res5, res6, res7); | |||||
PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, | PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, | ||||
tmp0, tmp1, tmp2, tmp3); | tmp0, tmp1, tmp2, tmp3); | ||||
@@ -933,99 +933,78 @@ | |||||
/* Description : Clips all halfword elements of input vector between min & max | /* Description : Clips all halfword elements of input vector between min & max | ||||
out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in)) | out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in)) | ||||
Arguments : Inputs - in (input vector) | |||||
- min (min threshold) | |||||
- max (max threshold) | |||||
Outputs - out_m (output vector with clipped elements) | |||||
Arguments : Inputs - in (input vector) | |||||
- min (min threshold) | |||||
- max (max threshold) | |||||
Outputs - in (output vector with clipped elements) | |||||
Return Type - signed halfword | Return Type - signed halfword | ||||
*/ | */ | ||||
#define CLIP_SH(in, min, max) \ | |||||
( { \ | |||||
v8i16 out_m; \ | |||||
\ | |||||
out_m = __msa_max_s_h((v8i16) min, (v8i16) in); \ | |||||
out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m); \ | |||||
out_m; \ | |||||
} ) | |||||
#define CLIP_SH(in, min, max) \ | |||||
{ \ | |||||
in = __msa_max_s_h((v8i16) min, (v8i16) in); \ | |||||
in = __msa_min_s_h((v8i16) max, (v8i16) in); \ | |||||
} | |||||
/* Description : Clips all signed halfword elements of input vector | /* Description : Clips all signed halfword elements of input vector | ||||
between 0 & 255 | between 0 & 255 | ||||
Arguments : Inputs - in (input vector) | |||||
Outputs - out_m (output vector with clipped elements) | |||||
Return Type - signed halfword | |||||
Arguments : Inputs - in (input vector) | |||||
Outputs - in (output vector with clipped elements) | |||||
Return Type - signed halfwords | |||||
*/ | */ | ||||
#define CLIP_SH_0_255(in) \ | |||||
( { \ | |||||
v8i16 max_m = __msa_ldi_h(255); \ | |||||
v8i16 out_m; \ | |||||
\ | |||||
out_m = __msa_maxi_s_h((v8i16) in, 0); \ | |||||
out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m); \ | |||||
out_m; \ | |||||
} ) | |||||
#define CLIP_SH_0_255(in) \ | |||||
{ \ | |||||
in = __msa_maxi_s_h((v8i16) in, 0); \ | |||||
in = (v8i16) __msa_sat_u_h((v8u16) in, 7); \ | |||||
} | |||||
#define CLIP_SH2_0_255(in0, in1) \ | #define CLIP_SH2_0_255(in0, in1) \ | ||||
{ \ | { \ | ||||
in0 = CLIP_SH_0_255(in0); \ | |||||
in1 = CLIP_SH_0_255(in1); \ | |||||
CLIP_SH_0_255(in0); \ | |||||
CLIP_SH_0_255(in1); \ | |||||
} | } | ||||
#define CLIP_SH4_0_255(in0, in1, in2, in3) \ | #define CLIP_SH4_0_255(in0, in1, in2, in3) \ | ||||
{ \ | { \ | ||||
CLIP_SH2_0_255(in0, in1); \ | CLIP_SH2_0_255(in0, in1); \ | ||||
CLIP_SH2_0_255(in2, in3); \ | CLIP_SH2_0_255(in2, in3); \ | ||||
} | } | ||||
#define CLIP_SH_0_255_MAX_SATU(in) \ | |||||
( { \ | |||||
v8i16 out_m; \ | |||||
\ | |||||
out_m = __msa_maxi_s_h((v8i16) in, 0); \ | |||||
out_m = (v8i16) __msa_sat_u_h((v8u16) out_m, 7); \ | |||||
out_m; \ | |||||
} ) | |||||
#define CLIP_SH2_0_255_MAX_SATU(in0, in1) \ | |||||
{ \ | |||||
in0 = CLIP_SH_0_255_MAX_SATU(in0); \ | |||||
in1 = CLIP_SH_0_255_MAX_SATU(in1); \ | |||||
} | |||||
#define CLIP_SH4_0_255_MAX_SATU(in0, in1, in2, in3) \ | |||||
{ \ | |||||
CLIP_SH2_0_255_MAX_SATU(in0, in1); \ | |||||
CLIP_SH2_0_255_MAX_SATU(in2, in3); \ | |||||
#define CLIP_SH8_0_255(in0, in1, in2, in3, \ | |||||
in4, in5, in6, in7) \ | |||||
{ \ | |||||
CLIP_SH4_0_255(in0, in1, in2, in3); \ | |||||
CLIP_SH4_0_255(in4, in5, in6, in7); \ | |||||
} | } | ||||
/* Description : Clips all signed word elements of input vector | /* Description : Clips all signed word elements of input vector | ||||
between 0 & 255 | between 0 & 255 | ||||
Arguments : Inputs - in (input vector) | |||||
Outputs - out_m (output vector with clipped elements) | |||||
Arguments : Inputs - in (input vector) | |||||
Outputs - in (output vector with clipped elements) | |||||
Return Type - signed word | Return Type - signed word | ||||
*/ | */ | ||||
#define CLIP_SW_0_255(in) \ | |||||
( { \ | |||||
v4i32 max_m = __msa_ldi_w(255); \ | |||||
v4i32 out_m; \ | |||||
\ | |||||
out_m = __msa_maxi_s_w((v4i32) in, 0); \ | |||||
out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m); \ | |||||
out_m; \ | |||||
} ) | |||||
#define CLIP_SW_0_255(in) \ | |||||
{ \ | |||||
in = __msa_maxi_s_w((v4i32) in, 0); \ | |||||
in = (v4i32) __msa_sat_u_w((v4u32) in, 7); \ | |||||
} | |||||
#define CLIP_SW_0_255_MAX_SATU(in) \ | |||||
( { \ | |||||
v4i32 out_m; \ | |||||
\ | |||||
out_m = __msa_maxi_s_w((v4i32) in, 0); \ | |||||
out_m = (v4i32) __msa_sat_u_w((v4u32) out_m, 7); \ | |||||
out_m; \ | |||||
} ) | |||||
#define CLIP_SW2_0_255_MAX_SATU(in0, in1) \ | |||||
{ \ | |||||
in0 = CLIP_SW_0_255_MAX_SATU(in0); \ | |||||
in1 = CLIP_SW_0_255_MAX_SATU(in1); \ | |||||
#define CLIP_SW2_0_255(in0, in1) \ | |||||
{ \ | |||||
CLIP_SW_0_255(in0); \ | |||||
CLIP_SW_0_255(in1); \ | |||||
} | } | ||||
#define CLIP_SW4_0_255_MAX_SATU(in0, in1, in2, in3) \ | |||||
{ \ | |||||
CLIP_SW2_0_255_MAX_SATU(in0, in1); \ | |||||
CLIP_SW2_0_255_MAX_SATU(in2, in3); \ | |||||
#define CLIP_SW4_0_255(in0, in1, in2, in3) \ | |||||
{ \ | |||||
CLIP_SW2_0_255(in0, in1); \ | |||||
CLIP_SW2_0_255(in2, in3); \ | |||||
} | |||||
#define CLIP_SW8_0_255(in0, in1, in2, in3, \ | |||||
in4, in5, in6, in7) \ | |||||
{ \ | |||||
CLIP_SW4_0_255(in0, in1, in2, in3); \ | |||||
CLIP_SW4_0_255(in4, in5, in6, in7); \ | |||||
} | } | ||||
/* Description : Addition of 4 signed word elements | /* Description : Addition of 4 signed word elements | ||||