beta0 and beta1 are always the same within a CU, so the luma deblocking filters now take a single beta value instead of a two-element array.
Signed-off-by: Mickaël Raulet <mraulet@insa-rennes.fr>
cherry picked from commit 4a23d82474
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
tags/n2.4
| @@ -340,7 +340,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) | |||
| uint8_t *src; | |||
| int x, y; | |||
| int chroma; | |||
| int c_tc[2], beta[2], tc[2]; | |||
| int c_tc[2], tc[2], beta; | |||
| uint8_t no_p[2] = { 0 }; | |||
| uint8_t no_q[2] = { 0 }; | |||
| @@ -381,13 +381,11 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) | |||
| const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; | |||
| const int bs1 = s->vertical_bs[(x + (y + 4) * s->bs_width) >> 2]; | |||
| if (bs0 || bs1) { | |||
| const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; | |||
| const int qp1 = (get_qPy(s, x - 1, y + 4) + get_qPy(s, x, y + 4) + 1) >> 1; | |||
| const int qp = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; | |||
| beta[0] = betatable[av_clip(qp0 + beta_offset, 0, MAX_QP)]; | |||
| beta[1] = betatable[av_clip(qp1 + beta_offset, 0, MAX_QP)]; | |||
| tc[0] = bs0 ? TC_CALC(qp0, bs0) : 0; | |||
| tc[1] = bs1 ? TC_CALC(qp1, bs1) : 0; | |||
| beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; | |||
| tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; | |||
| tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; | |||
| src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)]; | |||
| if (pcmf) { | |||
| no_p[0] = get_pcm(s, x - 1, y); | |||
| @@ -447,16 +445,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) | |||
| const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; | |||
| const int bs1 = s->horizontal_bs[((x + 4) + y * s->bs_width) >> 2]; | |||
| if (bs0 || bs1) { | |||
| const int qp0 = (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1; | |||
| const int qp1 = (get_qPy(s, x + 4, y - 1) + get_qPy(s, x + 4, y) + 1) >> 1; | |||
| const int qp = (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1; | |||
| tc_offset = x >= x0 ? cur_tc_offset : left_tc_offset; | |||
| beta_offset = x >= x0 ? cur_beta_offset : left_beta_offset; | |||
| beta[0] = betatable[av_clip(qp0 + beta_offset, 0, MAX_QP)]; | |||
| beta[1] = betatable[av_clip(qp1 + beta_offset, 0, MAX_QP)]; | |||
| tc[0] = bs0 ? TC_CALC(qp0, bs0) : 0; | |||
| tc[1] = bs1 ? TC_CALC(qp1, bs1) : 0; | |||
| beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; | |||
| tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; | |||
| tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; | |||
| src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)]; | |||
| if (pcmf) { | |||
| no_p[0] = get_pcm(s, x, y - 1); | |||
| @@ -97,20 +97,20 @@ typedef struct HEVCDSPContext { | |||
| int ox1, intptr_t mx, intptr_t my, int width); | |||
| void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, | |||
| int *beta, int *tc, | |||
| int beta, int *tc, | |||
| uint8_t *no_p, uint8_t *no_q); | |||
| void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, | |||
| int *beta, int *tc, | |||
| int beta, int *tc, | |||
| uint8_t *no_p, uint8_t *no_q); | |||
| void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, | |||
| int *tc, uint8_t *no_p, uint8_t *no_q); | |||
| void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, | |||
| int *tc, uint8_t *no_p, uint8_t *no_q); | |||
| void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride, | |||
| int *beta, int *tc, | |||
| int beta, int *tc, | |||
| uint8_t *no_p, uint8_t *no_q); | |||
| void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride, | |||
| int *beta, int *tc, | |||
| int beta, int *tc, | |||
| uint8_t *no_p, uint8_t *no_q); | |||
| void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, | |||
| int *tc, uint8_t *no_p, | |||
| @@ -1564,7 +1564,7 @@ static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin | |||
| static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, | |||
| ptrdiff_t _xstride, ptrdiff_t _ystride, | |||
| int *_beta, int *_tc, | |||
| int beta, int *_tc, | |||
| uint8_t *_no_p, uint8_t *_no_q) | |||
| { | |||
| int d, j; | |||
| @@ -1572,6 +1572,8 @@ static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, | |||
| ptrdiff_t xstride = _xstride / sizeof(pixel); | |||
| ptrdiff_t ystride = _ystride / sizeof(pixel); | |||
| beta <<= BIT_DEPTH - 8; | |||
| for (j = 0; j < 2; j++) { | |||
| const int dp0 = abs(P2 - 2 * P1 + P0); | |||
| const int dq0 = abs(Q2 - 2 * Q1 + Q0); | |||
| @@ -1579,7 +1581,6 @@ static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, | |||
| const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); | |||
| const int d0 = dp0 + dq0; | |||
| const int d3 = dp3 + dq3; | |||
| const int beta = _beta[j] << (BIT_DEPTH - 8); | |||
| const int tc = _tc[j] << (BIT_DEPTH - 8); | |||
| const int no_p = _no_p[j]; | |||
| const int no_q = _no_q[j]; | |||
| @@ -1706,7 +1707,7 @@ static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, | |||
| } | |||
| static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, | |||
| int *beta, int *tc, uint8_t *no_p, | |||
| int beta, int *tc, uint8_t *no_p, | |||
| uint8_t *no_q) | |||
| { | |||
| FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel), | |||
| @@ -1714,7 +1715,7 @@ static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, | |||
| } | |||
| static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, | |||
| int *beta, int *tc, uint8_t *no_p, | |||
| int beta, int *tc, uint8_t *no_p, | |||
| uint8_t *no_q) | |||
| { | |||
| FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride, | |||
| @@ -310,7 +310,7 @@ INIT_XMM sse2 | |||
| %endmacro | |||
| ALIGN 16 | |||
| ; input in m0 ... m3 and tcs in tc (r2). Output in m1 and m2 | |||
| ; input in m0 ... m3 and tcs in r2. Output in m1 and m2 | |||
| %macro CHROMA_DEBLOCK_BODY 1 | |||
| psubw m4, m2, m1; q0 - p0 | |||
| psubw m5, m0, m3; p1 - q1 | |||
| @@ -339,7 +339,7 @@ ALIGN 16 | |||
| psubw m2, m5; q0 - delta0 | |||
| %endmacro | |||
| ; input in m0 ... m7, betas in r2 tcs in r3. Output in m1...m6 | |||
| ; input in m0 ... m7, beta in r2, tcs in r3. Output in m1...m6 | |||
| %macro LUMA_DEBLOCK_BODY 2 | |||
| psllw m9, m2, 1; *2 | |||
| psubw m10, m1, m9 | |||
| @@ -352,20 +352,11 @@ ALIGN 16 | |||
| ABS1 m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3 | |||
| ;beta calculations | |||
| mov r11, [betaq]; | |||
| %if %1 > 8 | |||
| shl r11, %1 - 8 | |||
| %endif | |||
| movd m13, r11d; beta0 | |||
| add betaq, 4; | |||
| punpcklwd m13, m13 | |||
| mov r12, [betaq]; | |||
| %if %1 > 8 | |||
| shl r12, %1 - 8 | |||
| shl betaq, %1 - 8 | |||
| %endif | |||
| movd m14, r12d; beta1 | |||
| punpcklwd m14, m14 | |||
| pshufd m13, m14, 0; beta0, beta1 | |||
| movd m13, betaq | |||
| SPLATW m13, m13, 0 | |||
| ;end beta calculations | |||
| paddw m9, m10, m11; 0d0, 0d3 , 1d0, 1d3 | |||
| @@ -412,31 +403,31 @@ ALIGN 16 | |||
| ; end calc for weak filter | |||
| ; filtering mask | |||
| mov r2, r13 | |||
| shr r2, 3 | |||
| movd m15, r2d | |||
| mov r11, r13 | |||
| shr r11, 3 | |||
| movd m15, r11d | |||
| and r13, 1 | |||
| movd m11, r13d | |||
| shufps m11, m15, 0 | |||
| shl r2, 1 | |||
| or r13, r2 | |||
| shl r11, 1 | |||
| or r13, r11 | |||
| pcmpeqd m11, [pd_1]; filtering mask | |||
| ;decide between strong and weak filtering | |||
| ;tc25 calculations | |||
| mov r2d, [tcq]; | |||
| mov r11d, [tcq]; | |||
| %if %1 > 8 | |||
| shl r2, %1 - 8 | |||
| shl r11, %1 - 8 | |||
| %endif | |||
| movd m8, r2d; tc0 | |||
| movd m8, r11d; tc0 | |||
| add tcq, 4; | |||
| mov r3d, [tcq]; | |||
| %if %1 > 8 | |||
| shl r3, %1 - 8 | |||
| %endif | |||
| movd m9, r3d; tc1 | |||
| add r2d, r3d; tc0 + tc1 | |||
| add r11d, r3d; tc0 + tc1 | |||
| jz .bypassluma | |||
| punpcklwd m8, m8 | |||
| punpcklwd m9, m9 | |||
| @@ -460,8 +451,8 @@ ALIGN 16 | |||
| psraw m13, 3; beta >> 3 | |||
| pcmpgtw m13, m12; | |||
| movmskps r2, m13; | |||
| and r14, r2; strong mask , beta_2 and beta_3 comparisons | |||
| movmskps r11, m13; | |||
| and r14, r11; strong mask , beta_2 and beta_3 comparisons | |||
| ;----beta_3 comparison end----- | |||
| ;----tc25 comparison--- | |||
| psubw m12, m3, m4; p0 - q0 | |||
| @@ -471,24 +462,24 @@ ALIGN 16 | |||
| pshuflw m12, m12, 0xf0 ;0b11110000; | |||
| pcmpgtw m8, m12; tc25 comparisons | |||
| movmskps r2, m8; | |||
| and r14, r2; strong mask, beta_2, beta_3 and tc25 comparisons | |||
| movmskps r11, m8; | |||
| and r14, r11; strong mask, beta_2, beta_3 and tc25 comparisons | |||
| ;----tc25 comparison end--- | |||
| mov r2, r14; | |||
| shr r2, 1; | |||
| and r14, r2; strong mask, bits 2 and 0 | |||
| mov r11, r14; | |||
| shr r11, 1; | |||
| and r14, r11; strong mask, bits 2 and 0 | |||
| pmullw m14, m9, [pw_m2]; -tc * 2 | |||
| paddw m9, m9 | |||
| and r14, 5; 0b101 | |||
| mov r2, r14; strong mask | |||
| mov r11, r14; strong mask | |||
| shr r14, 2; | |||
| movd m12, r14d; store to xmm for mask generation | |||
| shl r14, 1 | |||
| and r2, 1 | |||
| movd m10, r2d; store to xmm for mask generation | |||
| or r14, r2; final strong mask, bits 1 and 0 | |||
| and r11, 1 | |||
| movd m10, r11d; store to xmm for mask generation | |||
| or r14, r11; final strong mask, bits 1 and 0 | |||
| jz .weakfilter | |||
| shufps m10, m12, 0 | |||
| @@ -578,23 +569,18 @@ ALIGN 16 | |||
| jz .store | |||
| ; weak filtering mask | |||
| mov r2, r14 | |||
| shr r2, 1 | |||
| movd m12, r2d | |||
| mov r11, r14 | |||
| shr r11, 1 | |||
| movd m12, r11d | |||
| and r14, 1 | |||
| movd m11, r14d | |||
| shufps m11, m12, 0 | |||
| pcmpeqd m11, [pd_1]; filtering mask | |||
| mov r13, r11; beta0 | |||
| shr r13, 1; | |||
| add r11, r13 | |||
| shr r11, 3; ((beta0+(beta0>>1))>>3)) | |||
| mov r13, r12; beta1 | |||
| mov r13, betaq | |||
| shr r13, 1; | |||
| add r12, r13 | |||
| shr r12, 3; ((beta1+(beta1>>1))>>3)) | |||
| add betaq, r13 | |||
| shr betaq, 3; ((beta + (beta >> 1)) >> 3) | |||
| mova m13, [pw_8] | |||
| psubw m12, m4, m3 ; q0 - p0 | |||
| @@ -633,11 +619,8 @@ ALIGN 16 | |||
| paddw m15, m2; p1' | |||
| ;beta calculations | |||
| movd m10, r11d; beta0 | |||
| punpcklwd m10, m10 | |||
| movd m13, r12d; beta1 | |||
| punpcklwd m13, m13 | |||
| shufps m10, m13, 0; betax0, betax1 | |||
| movd m10, betaq | |||
| SPLATW m10, m10, 0 | |||
| movd m13, r7d; 1dp0 + 1dp3 | |||
| movd m8, r8d; 0dp0 + 0dp3 | |||