|
|
|
@@ -377,10 +377,99 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64 |
|
|
|
RET |
|
|
|
%endmacro |
|
|
|
|
|
|
|
;-----------------------------------------------------------------------------
; DEBLOCK_H_LUMA_MBAFF: defines the function deblock_h_luma_mbaff_8.
;
; Named registers (from the cglobal line): pix, stride, alpha, beta, tc0 are
; the five arguments; base3 and stride3 are scratch.
; NOTE(review): the cglobal numbers are read here per x86inc convention
; (5 args, 9 GPRs, 10 vector registers, 8*16 bytes of stack) - confirm.
; NOTE(review): 9 GPRs are requested but only 7 are named in this block;
; confirm whether any helper macro needs the remaining two.
;
; Flow: load 8 rows of 8 bytes starting at pix-4, transpose, spill the result
; to the stack, run the filter macros, transpose back and store to the same
; 8 rows.
; LOAD_MASK, DIFF_GT2, LUMA_Q1, DEBLOCK_P0_Q0 and TRANSPOSE_8X8B are defined
; outside this block; statements about them below are marked as assumptions.
; Uses vector registers m8 and m9.
;-----------------------------------------------------------------------------
%macro DEBLOCK_H_LUMA_MBAFF 0 |
|
|
|
|
|
|
|
cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_, base3_, stride3_ |
|
|
|
; Sign-extend the 32-bit stride so it can be used in 64-bit addressing.
movsxd stride_q, stride_d |
|
|
|
; Pre-decrement both thresholds; the compares further down are documented
; as "> beta-1".
dec alpha_d |
|
|
|
dec beta_d |
|
|
|
; base3 = pix + 3*stride and stride3 = 3*stride, so that rows 3..7 are
; reachable as base3 + {0, 1, 2, 3, 4}*stride.
mov base3_q, pix_q |
|
|
|
lea stride3_q, [3*stride_q] |
|
|
|
add base3_q, stride3_q |
|
|
|
|
|
|
|
; Load 8 bytes from each of 8 consecutive rows, starting 4 bytes before pix.
movq m0, [pix_q - 4] |
|
|
|
movq m1, [pix_q + stride_q - 4] |
|
|
|
movq m2, [pix_q + 2*stride_q - 4] |
|
|
|
movq m3, [base3_q - 4] |
|
|
|
movq m4, [base3_q + stride_q - 4] |
|
|
|
movq m5, [base3_q + 2*stride_q - 4] |
|
|
|
movq m6, [base3_q + stride3_q - 4] |
|
|
|
movq m7, [base3_q + 4*stride_q - 4] |
|
|
|
|
|
|
|
; NOTE(review): TRANSPOSE_8X8B is defined elsewhere; presumably an 8x8 byte
; transpose, after which each register holds one pixel column (p3..q3)
; across the 8 rows - confirm.
TRANSPOSE_8X8B 0,1,2,3,4,5,6,7 |
|
|
|
|
|
|
|
; Spill m0..m7 to stack slots spaced 16 bytes apart. movq writes only the
; low 8 bytes of each slot.
%assign i 0 |
|
|
|
%rep 8 |
|
|
|
movq [rsp + 16*i], m %+ i |
|
|
|
%assign i i+1 |
|
|
|
%endrep |
|
|
|
|
|
|
|
; Register / stack slot holding each column at this point
; (by the same pattern m0 / [rsp] and m7 / [rsp + 112] would be p3 and q3):
; p2 = m1 [rsp + 16] |
|
|
|
; p1 = m2 [rsp + 32] |
|
|
|
; p0 = m3 [rsp + 48] |
|
|
|
; q0 = m4 [rsp + 64] |
|
|
|
; q1 = m5 [rsp + 80] |
|
|
|
; q2 = m6 [rsp + 96] |
|
|
|
|
|
|
|
; Register renaming. Assuming x86inc SWAP semantics (names are exchanged at
; assembly time, no instruction is emitted), after these four swaps:
; m0 = p1, m1 = p0, m2 = q0, m3 = q1.
SWAP 0, 2 |
|
|
|
SWAP 1, 3 |
|
|
|
SWAP 2, 4 |
|
|
|
SWAP 3, 5 |
|
|
|
|
|
|
|
; NOTE(review): LOAD_MASK is defined elsewhere; the code below treats m7 as
; the per-byte filter mask it produces - confirm its in/out/clobber contract.
LOAD_MASK alpha_d, beta_d |
|
|
|
; Load 4 tc0 bytes and duplicate each one, so every tc0 value covers two
; adjacent byte lanes (8 lanes in total).
movd m8, [tc0_q] |
|
|
|
punpcklbw m8, m8 |
|
|
|
; m9 = all ones, then 0xFF exactly in the lanes where tc0 == 0xFF (-1).
pcmpeqb m9, m9 |
|
|
|
pcmpeqb m9, m8 |
|
|
|
; m9 = m7 & ~m9: the mask with the tc0 == -1 lanes cleared.
pandn m9, m7 |
|
|
|
; m8 = tc0 restricted to the lanes still enabled in m9.
pand m8, m9 |
|
|
|
|
|
|
|
; ---- p side ----
; This is a 16-byte load from a slot whose upper 8 bytes were not written by
; the spill above; only the low 8 bytes of each result are stored at the end.
movdqa m3, [rsp + 16] ; p2 |
|
|
|
; NOTE(review): DIFF_GT2 is defined elsewhere; m6 is used below as its
; per-byte compare result - confirm.
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 |
|
|
|
pand m6, m9 |
|
|
|
; m7 = m8 - m6 (three-operand form). If m6 is a 0 / -1 byte mask this adds 1
; to tc0 in the selected lanes - presumably the tc consumed by DEBLOCK_P0_Q0.
psubb m7, m8, m6 |
|
|
|
pand m6, m8 |
|
|
|
; NOTE(review): LUMA_Q1 is defined elsewhere; presumably it writes the
; filtered p1 to [rsp + 32], which is reloaded before the final transpose.
LUMA_Q1 m0, m3, [rsp + 16], [rsp + 32], m6, m4 |
|
|
|
|
|
|
|
; ---- q side (mirrors the p side) ----
movdqa m4, [rsp + 96] ; q2 |
|
|
|
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 |
|
|
|
pand m6, m9 |
|
|
|
pand m8, m6 |
|
|
|
; Same adjustment of m7 as on the p side, this time for the q-side result.
psubb m7, m6 |
|
|
|
; Reload the q1 column from its stack slot into m3.
mova m3, [rsp + 80] |
|
|
|
; NOTE(review): presumably writes the filtered q1 to [rsp + 80] - confirm.
LUMA_Q1 m3, m4, [rsp + 96], [rsp + 80], m8, m6 |
|
|
|
|
|
|
|
; NOTE(review): DEBLOCK_P0_Q0 is defined elsewhere; presumably it leaves the
; filtered p0 / q0 in m1 / m2 - confirm.
DEBLOCK_P0_Q0 |
|
|
|
; Rename m1 / m2 to m3 / m4, the positions matching the p0 / q0 slots
; ([rsp + 48] / [rsp + 64]) in the column order used by the transpose.
SWAP 1, 3 |
|
|
|
SWAP 2, 4 |
|
|
|
; Reload the other six columns from their stack slots
; (offsets 0, 16, 32 and 80, 96, 112).
movq m0, [rsp] |
|
|
|
movq m1, [rsp + 16] |
|
|
|
movq m2, [rsp + 32] |
|
|
|
movq m5, [rsp + 80] |
|
|
|
movq m6, [rsp + 96] |
|
|
|
movq m7, [rsp + 112] |
|
|
|
|
|
|
|
; Transpose back and write 8 bytes to each of the 8 row addresses that were
; read at the top of the function.
TRANSPOSE_8X8B 0,1,2,3,4,5,6,7 |
|
|
|
movq [pix_q - 4], m0 |
|
|
|
movq [pix_q + stride_q - 4], m1 |
|
|
|
movq [pix_q + 2*stride_q - 4], m2 |
|
|
|
movq [base3_q - 4], m3 |
|
|
|
movq [base3_q + stride_q - 4], m4 |
|
|
|
movq [base3_q + 2*stride_q - 4], m5 |
|
|
|
movq [base3_q + stride3_q - 4], m6 |
|
|
|
movq [base3_q + 4*stride_q - 4], m7 |
|
|
|
|
|
|
|
RET |
|
|
|
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
; Expand both macros after INIT_XMM sse2.
; NOTE(review): assumes x86inc INIT_XMM, which selects the instruction set
; and function-name suffix for the cglobal definitions that follow - confirm.
INIT_XMM sse2 |
|
|
|
DEBLOCK_H_LUMA_MBAFF |
|
|
|
DEBLOCK_LUMA |
|
|
|
|
|
|
|
; Expand the same two macros again after INIT_XMM avx, only when
; HAVE_AVX_EXTERNAL is non-zero.
%if HAVE_AVX_EXTERNAL |
|
|
|
INIT_XMM avx |
|
|
|
DEBLOCK_H_LUMA_MBAFF |
|
|
|
DEBLOCK_LUMA |
|
|
|
%endif |
|
|
|
|
|
|
|
|