beta0 and beta1 will always be the same within a CU
Signed-off-by: Mickaël Raulet <mraulet@insa-rennes.fr>
cherry picked from commit 4a23d82474
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
tags/n2.4
| @@ -340,7 +340,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) | |||||
| uint8_t *src; | uint8_t *src; | ||||
| int x, y; | int x, y; | ||||
| int chroma; | int chroma; | ||||
| int c_tc[2], beta[2], tc[2]; | |||||
| int c_tc[2], tc[2], beta; | |||||
| uint8_t no_p[2] = { 0 }; | uint8_t no_p[2] = { 0 }; | ||||
| uint8_t no_q[2] = { 0 }; | uint8_t no_q[2] = { 0 }; | ||||
| @@ -381,13 +381,11 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) | |||||
| const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; | const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; | ||||
| const int bs1 = s->vertical_bs[(x + (y + 4) * s->bs_width) >> 2]; | const int bs1 = s->vertical_bs[(x + (y + 4) * s->bs_width) >> 2]; | ||||
| if (bs0 || bs1) { | if (bs0 || bs1) { | ||||
| const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; | |||||
| const int qp1 = (get_qPy(s, x - 1, y + 4) + get_qPy(s, x, y + 4) + 1) >> 1; | |||||
| const int qp = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; | |||||
| beta[0] = betatable[av_clip(qp0 + beta_offset, 0, MAX_QP)]; | |||||
| beta[1] = betatable[av_clip(qp1 + beta_offset, 0, MAX_QP)]; | |||||
| tc[0] = bs0 ? TC_CALC(qp0, bs0) : 0; | |||||
| tc[1] = bs1 ? TC_CALC(qp1, bs1) : 0; | |||||
| beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; | |||||
| tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; | |||||
| tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; | |||||
| src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)]; | src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)]; | ||||
| if (pcmf) { | if (pcmf) { | ||||
| no_p[0] = get_pcm(s, x - 1, y); | no_p[0] = get_pcm(s, x - 1, y); | ||||
| @@ -447,16 +445,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) | |||||
| const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; | const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; | ||||
| const int bs1 = s->horizontal_bs[((x + 4) + y * s->bs_width) >> 2]; | const int bs1 = s->horizontal_bs[((x + 4) + y * s->bs_width) >> 2]; | ||||
| if (bs0 || bs1) { | if (bs0 || bs1) { | ||||
| const int qp0 = (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1; | |||||
| const int qp1 = (get_qPy(s, x + 4, y - 1) + get_qPy(s, x + 4, y) + 1) >> 1; | |||||
| const int qp = (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1; | |||||
| tc_offset = x >= x0 ? cur_tc_offset : left_tc_offset; | tc_offset = x >= x0 ? cur_tc_offset : left_tc_offset; | ||||
| beta_offset = x >= x0 ? cur_beta_offset : left_beta_offset; | beta_offset = x >= x0 ? cur_beta_offset : left_beta_offset; | ||||
| beta[0] = betatable[av_clip(qp0 + beta_offset, 0, MAX_QP)]; | |||||
| beta[1] = betatable[av_clip(qp1 + beta_offset, 0, MAX_QP)]; | |||||
| tc[0] = bs0 ? TC_CALC(qp0, bs0) : 0; | |||||
| tc[1] = bs1 ? TC_CALC(qp1, bs1) : 0; | |||||
| beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; | |||||
| tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; | |||||
| tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; | |||||
| src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)]; | src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)]; | ||||
| if (pcmf) { | if (pcmf) { | ||||
| no_p[0] = get_pcm(s, x, y - 1); | no_p[0] = get_pcm(s, x, y - 1); | ||||
| @@ -97,20 +97,20 @@ typedef struct HEVCDSPContext { | |||||
| int ox1, intptr_t mx, intptr_t my, int width); | int ox1, intptr_t mx, intptr_t my, int width); | ||||
| void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, | void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, | ||||
| int *beta, int *tc, | |||||
| int beta, int *tc, | |||||
| uint8_t *no_p, uint8_t *no_q); | uint8_t *no_p, uint8_t *no_q); | ||||
| void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, | void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, | ||||
| int *beta, int *tc, | |||||
| int beta, int *tc, | |||||
| uint8_t *no_p, uint8_t *no_q); | uint8_t *no_p, uint8_t *no_q); | ||||
| void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, | void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, | ||||
| int *tc, uint8_t *no_p, uint8_t *no_q); | int *tc, uint8_t *no_p, uint8_t *no_q); | ||||
| void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, | void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, | ||||
| int *tc, uint8_t *no_p, uint8_t *no_q); | int *tc, uint8_t *no_p, uint8_t *no_q); | ||||
| void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride, | void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride, | ||||
| int *beta, int *tc, | |||||
| int beta, int *tc, | |||||
| uint8_t *no_p, uint8_t *no_q); | uint8_t *no_p, uint8_t *no_q); | ||||
| void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride, | void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride, | ||||
| int *beta, int *tc, | |||||
| int beta, int *tc, | |||||
| uint8_t *no_p, uint8_t *no_q); | uint8_t *no_p, uint8_t *no_q); | ||||
| void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, | void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, | ||||
| int *tc, uint8_t *no_p, | int *tc, uint8_t *no_p, | ||||
| @@ -1564,7 +1564,7 @@ static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin | |||||
| static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, | static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, | ||||
| ptrdiff_t _xstride, ptrdiff_t _ystride, | ptrdiff_t _xstride, ptrdiff_t _ystride, | ||||
| int *_beta, int *_tc, | |||||
| int beta, int *_tc, | |||||
| uint8_t *_no_p, uint8_t *_no_q) | uint8_t *_no_p, uint8_t *_no_q) | ||||
| { | { | ||||
| int d, j; | int d, j; | ||||
| @@ -1572,6 +1572,8 @@ static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, | |||||
| ptrdiff_t xstride = _xstride / sizeof(pixel); | ptrdiff_t xstride = _xstride / sizeof(pixel); | ||||
| ptrdiff_t ystride = _ystride / sizeof(pixel); | ptrdiff_t ystride = _ystride / sizeof(pixel); | ||||
| beta <<= BIT_DEPTH - 8; | |||||
| for (j = 0; j < 2; j++) { | for (j = 0; j < 2; j++) { | ||||
| const int dp0 = abs(P2 - 2 * P1 + P0); | const int dp0 = abs(P2 - 2 * P1 + P0); | ||||
| const int dq0 = abs(Q2 - 2 * Q1 + Q0); | const int dq0 = abs(Q2 - 2 * Q1 + Q0); | ||||
| @@ -1579,7 +1581,6 @@ static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, | |||||
| const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); | const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); | ||||
| const int d0 = dp0 + dq0; | const int d0 = dp0 + dq0; | ||||
| const int d3 = dp3 + dq3; | const int d3 = dp3 + dq3; | ||||
| const int beta = _beta[j] << (BIT_DEPTH - 8); | |||||
| const int tc = _tc[j] << (BIT_DEPTH - 8); | const int tc = _tc[j] << (BIT_DEPTH - 8); | ||||
| const int no_p = _no_p[j]; | const int no_p = _no_p[j]; | ||||
| const int no_q = _no_q[j]; | const int no_q = _no_q[j]; | ||||
| @@ -1706,7 +1707,7 @@ static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, | |||||
| } | } | ||||
| static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, | static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, | ||||
| int *beta, int *tc, uint8_t *no_p, | |||||
| int beta, int *tc, uint8_t *no_p, | |||||
| uint8_t *no_q) | uint8_t *no_q) | ||||
| { | { | ||||
| FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel), | FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel), | ||||
| @@ -1714,7 +1715,7 @@ static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, | |||||
| } | } | ||||
| static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, | static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, | ||||
| int *beta, int *tc, uint8_t *no_p, | |||||
| int beta, int *tc, uint8_t *no_p, | |||||
| uint8_t *no_q) | uint8_t *no_q) | ||||
| { | { | ||||
| FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride, | FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride, | ||||
| @@ -310,7 +310,7 @@ INIT_XMM sse2 | |||||
| %endmacro | %endmacro | ||||
| ALIGN 16 | ALIGN 16 | ||||
| ; input in m0 ... m3 and tcs in tc (r2). Output in m1 and m2 | |||||
| ; input in m0 ... m3 and tcs in r2. Output in m1 and m2 | |||||
| %macro CHROMA_DEBLOCK_BODY 1 | %macro CHROMA_DEBLOCK_BODY 1 | ||||
| psubw m4, m2, m1; q0 - p0 | psubw m4, m2, m1; q0 - p0 | ||||
| psubw m5, m0, m3; p1 - q1 | psubw m5, m0, m3; p1 - q1 | ||||
| @@ -339,7 +339,7 @@ ALIGN 16 | |||||
| psubw m2, m5; q0 - delta0 | psubw m2, m5; q0 - delta0 | ||||
| %endmacro | %endmacro | ||||
| ; input in m0 ... m7, betas in r2 tcs in r3. Output in m1...m6 | |||||
| ; input in m0 ... m7, beta in r2, tcs in r3. Output in m1...m6 | |||||
| %macro LUMA_DEBLOCK_BODY 2 | %macro LUMA_DEBLOCK_BODY 2 | ||||
| psllw m9, m2, 1; *2 | psllw m9, m2, 1; *2 | ||||
| psubw m10, m1, m9 | psubw m10, m1, m9 | ||||
| @@ -352,20 +352,11 @@ ALIGN 16 | |||||
| ABS1 m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3 | ABS1 m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3 | ||||
| ;beta calculations | ;beta calculations | ||||
| mov r11, [betaq]; | |||||
| %if %1 > 8 | %if %1 > 8 | ||||
| shl r11, %1 - 8 | |||||
| %endif | |||||
| movd m13, r11d; beta0 | |||||
| add betaq, 4; | |||||
| punpcklwd m13, m13 | |||||
| mov r12, [betaq]; | |||||
| %if %1 > 8 | |||||
| shl r12, %1 - 8 | |||||
| shl betaq, %1 - 8 | |||||
| %endif | %endif | ||||
| movd m14, r12d; beta1 | |||||
| punpcklwd m14, m14 | |||||
| pshufd m13, m14, 0; beta0, beta1 | |||||
| movd m13, betaq | |||||
| SPLATW m13, m13, 0 | |||||
| ;end beta calculations | ;end beta calculations | ||||
| paddw m9, m10, m11; 0d0, 0d3 , 1d0, 1d3 | paddw m9, m10, m11; 0d0, 0d3 , 1d0, 1d3 | ||||
| @@ -412,31 +403,31 @@ ALIGN 16 | |||||
| ; end calc for weak filter | ; end calc for weak filter | ||||
| ; filtering mask | ; filtering mask | ||||
| mov r2, r13 | |||||
| shr r2, 3 | |||||
| movd m15, r2d | |||||
| mov r11, r13 | |||||
| shr r11, 3 | |||||
| movd m15, r11d | |||||
| and r13, 1 | and r13, 1 | ||||
| movd m11, r13d | movd m11, r13d | ||||
| shufps m11, m15, 0 | shufps m11, m15, 0 | ||||
| shl r2, 1 | |||||
| or r13, r2 | |||||
| shl r11, 1 | |||||
| or r13, r11 | |||||
| pcmpeqd m11, [pd_1]; filtering mask | pcmpeqd m11, [pd_1]; filtering mask | ||||
| ;decide between strong and weak filtering | ;decide between strong and weak filtering | ||||
| ;tc25 calculations | ;tc25 calculations | ||||
| mov r2d, [tcq]; | |||||
| mov r11d, [tcq]; | |||||
| %if %1 > 8 | %if %1 > 8 | ||||
| shl r2, %1 - 8 | |||||
| shl r11, %1 - 8 | |||||
| %endif | %endif | ||||
| movd m8, r2d; tc0 | |||||
| movd m8, r11d; tc0 | |||||
| add tcq, 4; | add tcq, 4; | ||||
| mov r3d, [tcq]; | mov r3d, [tcq]; | ||||
| %if %1 > 8 | %if %1 > 8 | ||||
| shl r3, %1 - 8 | shl r3, %1 - 8 | ||||
| %endif | %endif | ||||
| movd m9, r3d; tc1 | movd m9, r3d; tc1 | ||||
| add r2d, r3d; tc0 + tc1 | |||||
| add r11d, r3d; tc0 + tc1 | |||||
| jz .bypassluma | jz .bypassluma | ||||
| punpcklwd m8, m8 | punpcklwd m8, m8 | ||||
| punpcklwd m9, m9 | punpcklwd m9, m9 | ||||
| @@ -460,8 +451,8 @@ ALIGN 16 | |||||
| psraw m13, 3; beta >> 3 | psraw m13, 3; beta >> 3 | ||||
| pcmpgtw m13, m12; | pcmpgtw m13, m12; | ||||
| movmskps r2, m13; | |||||
| and r14, r2; strong mask , beta_2 and beta_3 comparisons | |||||
| movmskps r11, m13; | |||||
| and r14, r11; strong mask , beta_2 and beta_3 comparisons | |||||
| ;----beta_3 comparison end----- | ;----beta_3 comparison end----- | ||||
| ;----tc25 comparison--- | ;----tc25 comparison--- | ||||
| psubw m12, m3, m4; p0 - q0 | psubw m12, m3, m4; p0 - q0 | ||||
| @@ -471,24 +462,24 @@ ALIGN 16 | |||||
| pshuflw m12, m12, 0xf0 ;0b11110000; | pshuflw m12, m12, 0xf0 ;0b11110000; | ||||
| pcmpgtw m8, m12; tc25 comparisons | pcmpgtw m8, m12; tc25 comparisons | ||||
| movmskps r2, m8; | |||||
| and r14, r2; strong mask, beta_2, beta_3 and tc25 comparisons | |||||
| movmskps r11, m8; | |||||
| and r14, r11; strong mask, beta_2, beta_3 and tc25 comparisons | |||||
| ;----tc25 comparison end--- | ;----tc25 comparison end--- | ||||
| mov r2, r14; | |||||
| shr r2, 1; | |||||
| and r14, r2; strong mask, bits 2 and 0 | |||||
| mov r11, r14; | |||||
| shr r11, 1; | |||||
| and r14, r11; strong mask, bits 2 and 0 | |||||
| pmullw m14, m9, [pw_m2]; -tc * 2 | pmullw m14, m9, [pw_m2]; -tc * 2 | ||||
| paddw m9, m9 | paddw m9, m9 | ||||
| and r14, 5; 0b101 | and r14, 5; 0b101 | ||||
| mov r2, r14; strong mask | |||||
| mov r11, r14; strong mask | |||||
| shr r14, 2; | shr r14, 2; | ||||
| movd m12, r14d; store to xmm for mask generation | movd m12, r14d; store to xmm for mask generation | ||||
| shl r14, 1 | shl r14, 1 | ||||
| and r2, 1 | |||||
| movd m10, r2d; store to xmm for mask generation | |||||
| or r14, r2; final strong mask, bits 1 and 0 | |||||
| and r11, 1 | |||||
| movd m10, r11d; store to xmm for mask generation | |||||
| or r14, r11; final strong mask, bits 1 and 0 | |||||
| jz .weakfilter | jz .weakfilter | ||||
| shufps m10, m12, 0 | shufps m10, m12, 0 | ||||
| @@ -578,23 +569,18 @@ ALIGN 16 | |||||
| jz .store | jz .store | ||||
| ; weak filtering mask | ; weak filtering mask | ||||
| mov r2, r14 | |||||
| shr r2, 1 | |||||
| movd m12, r2d | |||||
| mov r11, r14 | |||||
| shr r11, 1 | |||||
| movd m12, r11d | |||||
| and r14, 1 | and r14, 1 | ||||
| movd m11, r14d | movd m11, r14d | ||||
| shufps m11, m12, 0 | shufps m11, m12, 0 | ||||
| pcmpeqd m11, [pd_1]; filtering mask | pcmpeqd m11, [pd_1]; filtering mask | ||||
| mov r13, r11; beta0 | |||||
| shr r13, 1; | |||||
| add r11, r13 | |||||
| shr r11, 3; ((beta0+(beta0>>1))>>3)) | |||||
| mov r13, r12; beta1 | |||||
| mov r13, betaq | |||||
| shr r13, 1; | shr r13, 1; | ||||
| add r12, r13 | |||||
| shr r12, 3; ((beta1+(beta1>>1))>>3)) | |||||
| add betaq, r13 | |||||
| shr betaq, 3; ((beta + (beta >> 1)) >> 3) | |||||
| mova m13, [pw_8] | mova m13, [pw_8] | ||||
| psubw m12, m4, m3 ; q0 - p0 | psubw m12, m4, m3 ; q0 - p0 | ||||
| @@ -633,11 +619,8 @@ ALIGN 16 | |||||
| paddw m15, m2; p1' | paddw m15, m2; p1' | ||||
| ;beta calculations | ;beta calculations | ||||
| movd m10, r11d; beta0 | |||||
| punpcklwd m10, m10 | |||||
| movd m13, r12d; beta1 | |||||
| punpcklwd m13, m13 | |||||
| shufps m10, m13, 0; betax0, betax1 | |||||
| movd m10, betaq | |||||
| SPLATW m10, m10, 0 | |||||
| movd m13, r7d; 1dp0 + 1dp3 | movd m13, r7d; 1dp0 + 1dp3 | ||||
| movd m8, r8d; 0dp0 + 0dp3 | movd m8, r8d; 0dp0 + 0dp3 | ||||