From bee330e300ba6424d0f8b119550c9cc26a1d9f02 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sun, 4 Mar 2012 16:17:14 -0800 Subject: [PATCH 1/7] vp8: convert inner loopfilter x86 assembly to use named arguments. --- libavcodec/x86/vp8dsp.asm | 351 ++++++++++++++++++-------------------- 1 file changed, 164 insertions(+), 187 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 8e13560b9e..793087e99c 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -1654,174 +1654,151 @@ SIMPLE_LOOPFILTER h, 5 ; int flimE, int flimI, int hev_thr); ;----------------------------------------------------------------------------- -%macro INNER_LOOPFILTER 3 -%if %3 == 8 ; chroma -cglobal vp8_%1_loop_filter8uv_inner, 6, %2, 13 -%define dst8_reg r1 -%define mstride_reg r2 -%define E_reg r3 -%define I_reg r4 -%define hev_thr_reg r5 +%macro INNER_LOOPFILTER 2 +%if %2 == 8 ; chroma +cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, dst, dst8, stride, flimE, flimI, hevthr %else ; luma -cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13 -%define mstride_reg r1 -%define E_reg r2 -%define I_reg r3 -%define hev_thr_reg r4 -%ifdef m8 ; x86-64, sse2 -%define dst8_reg r4 -%elif mmsize == 16 ; x86-32, sse2 -%define dst8_reg r5 -%else ; x86-32, mmx/mmxext -%define cnt_reg r5 -%endif -%endif -%define dst_reg r0 -%define stride_reg E_reg -%define dst2_reg I_reg -%ifndef m8 -%define stack_reg hev_thr_reg +cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr %endif %if cpuflag(ssse3) pxor m7, m7 %endif - -%ifndef m8 ; mmx/mmxext or sse2 on x86-32 - ; splat function arguments - SPLATB_REG m0, E_reg, m7 ; E - SPLATB_REG m1, I_reg, m7 ; I - SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh - - ; align stack - mov stack_reg, rsp ; backup stack pointer - and rsp, ~(mmsize-1) ; align stack -%ifidn %1, v - sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr - ; [3]=hev() result -%else ; h - sub rsp, mmsize * 5 ; extra storage space for transposes +%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr +%ifidn %1, v ; [3]=hev() result +%assign pad 16 + mmsize * 4 - gprsize - (stack_offset & 15) +%else ; h ; extra storage space for transposes +%assign pad 16 + mmsize * 5 - gprsize - (stack_offset & 15) %endif - -%define flim_E [rsp] -%define flim_I [rsp+mmsize] -%define hev_thr [rsp+mmsize*2] -%define mask_res [rsp+mmsize*3] -%define p0backup [rsp+mmsize*3] -%define q0backup [rsp+mmsize*4] - - mova flim_E, m0 - mova flim_I, m1 - mova hev_thr, m2 - -%else ; sse2 on x86-64 - -%define flim_E m9 -%define flim_I m10 -%define hev_thr m11 -%define mask_res m12 -%define p0backup m12 -%define q0backup m8 + ; splat function arguments + SPLATB_REG m0, flimEq, m7 ; E + SPLATB_REG m1, flimIq, m7 ; I + SPLATB_REG m2, hevthrq, m7 ; hev_thresh + + SUB rsp, pad + +%define m_flimE [rsp] +%define m_flimI [rsp+mmsize] +%define m_hevthr [rsp+mmsize*2] +%define m_maskres [rsp+mmsize*3] +%define m_p0backup [rsp+mmsize*3] +%define m_q0backup [rsp+mmsize*4] + + mova m_flimE, m0 + mova m_flimI, m1 + mova m_hevthr, m2 +%else +%define m_flimE m9 +%define m_flimI m10 +%define m_hevthr m11 +%define m_maskres m12 +%define m_p0backup m12 +%define m_q0backup m8 ; splat function arguments - SPLATB_REG flim_E, E_reg, m7 ; E - SPLATB_REG flim_I, I_reg, m7 ; I - SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh + SPLATB_REG m_flimE, flimEq, m7 ; E + SPLATB_REG m_flimI, flimIq, m7 ; I + SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh %endif -%if mmsize == 8 && %3 == 16 ; mmx/mmxext - mov cnt_reg, 2 +%if %2 == 8 ; chroma + DEFINE_ARGS dst1, dst8, mstride, stride, dst2 +%elif mmsize == 8 + DEFINE_ARGS dst1, mstride, stride, dst2, cntr + mov cntrq, 2 +%else + DEFINE_ARGS dst1, mstride, stride, dst2, dst8 %endif - mov stride_reg, mstride_reg - neg mstride_reg + mov strideq, mstrideq + neg mstrideq %ifidn %1, h - lea dst_reg, [dst_reg + stride_reg*4-4] -%if %3 == 8 - lea dst8_reg, [dst8_reg+ stride_reg*4-4] + lea dst1q, [dst1q+strideq*4-4] +%if %2 == 8 ; chroma + lea dst8q, [dst8q+strideq*4-4] %endif %endif %if mmsize == 8 -.next8px +.next8px: %endif ; read - lea dst2_reg, [dst_reg + stride_reg] + lea dst2q, [dst1q+strideq] %ifidn %1, v -%if %3 == 8 && mmsize == 16 +%if %2 == 8 && mmsize == 16 %define movrow movh %else %define movrow mova %endif - movrow m0, [dst_reg +mstride_reg*4] ; p3 - movrow m1, [dst2_reg+mstride_reg*4] ; p2 - movrow m2, [dst_reg +mstride_reg*2] ; p1 - movrow m5, [dst2_reg] ; q1 - movrow m6, [dst2_reg+ stride_reg] ; q2 - movrow m7, [dst2_reg+ stride_reg*2] ; q3 -%if mmsize == 16 && %3 == 8 - movhps m0, [dst8_reg+mstride_reg*4] - movhps m2, [dst8_reg+mstride_reg*2] - add dst8_reg, stride_reg - movhps m1, [dst8_reg+mstride_reg*4] - movhps m5, [dst8_reg] - movhps m6, [dst8_reg+ stride_reg] - movhps m7, [dst8_reg+ stride_reg*2] - add dst8_reg, mstride_reg + movrow m0, [dst1q+mstrideq*4] ; p3 + movrow m1, [dst2q+mstrideq*4] ; p2 + movrow m2, [dst1q+mstrideq*2] ; p1 + movrow m5, [dst2q] ; q1 + movrow m6, [dst2q+ strideq*1] ; q2 + movrow m7, [dst2q+ strideq*2] ; q3 +%if mmsize == 16 && %2 == 8 + movhps m0, [dst8q+mstrideq*4] + movhps m2, [dst8q+mstrideq*2] + add dst8q, strideq + movhps m1, [dst8q+mstrideq*4] + movhps m5, [dst8q] + movhps m6, [dst8q+ strideq ] + movhps m7, [dst8q+ strideq*2] + add dst8q, mstrideq %endif %elif mmsize == 8 ; mmx/mmxext (h) ; read 8 rows of 8px each - movu m0, [dst_reg +mstride_reg*4] - movu m1, [dst2_reg+mstride_reg*4] - movu m2, [dst_reg +mstride_reg*2] - movu m3, [dst_reg +mstride_reg] - movu m4, [dst_reg] - movu m5, [dst2_reg] - movu m6, [dst2_reg+ stride_reg] + movu m0, [dst1q+mstrideq*4] + movu m1, [dst2q+mstrideq*4] + movu m2, [dst1q+mstrideq*2] + movu m3, [dst1q+mstrideq ] + movu m4, [dst1q] + movu m5, [dst2q] + movu m6, [dst2q+ strideq ] ; 8x8 transpose TRANSPOSE4x4B 0, 1, 2, 3, 7 - mova q0backup, m1 - movu m7, [dst2_reg+ stride_reg*2] + mova m_q0backup, m1 + movu m7, [dst2q+ strideq*2] TRANSPOSE4x4B 4, 5, 6, 7, 1 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 - mova m1, q0backup - mova q0backup, m2 ; store q0 + mova m1, m_q0backup + mova m_q0backup, m2 ; store q0 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 - mova p0backup, m5 ; store p0 + mova m_p0backup, m5 ; store p0 SWAP 1, 4 SWAP 2, 4 SWAP 6, 3 SWAP 5, 3 %else ; sse2 (h) -%if %3 == 16 - lea dst8_reg, [dst_reg + stride_reg*8] +%if %2 == 16 + lea dst8q, [dst1q+ strideq*8] %endif ; read 16 rows of 8px each, interleave - movh m0, [dst_reg +mstride_reg*4] - movh m1, [dst8_reg+mstride_reg*4] - movh m2, [dst_reg +mstride_reg*2] - movh m5, [dst8_reg+mstride_reg*2] - movh m3, [dst_reg +mstride_reg] - movh m6, [dst8_reg+mstride_reg] - movh m4, [dst_reg] - movh m7, [dst8_reg] + movh m0, [dst1q+mstrideq*4] + movh m1, [dst8q+mstrideq*4] + movh m2, [dst1q+mstrideq*2] + movh m5, [dst8q+mstrideq*2] + movh m3, [dst1q+mstrideq ] + movh m6, [dst8q+mstrideq ] + movh m4, [dst1q] + movh m7, [dst8q] punpcklbw m0, m1 ; A/I punpcklbw m2, m5 ; C/K punpcklbw m3, m6 ; D/L punpcklbw m4, m7 ; E/M - add dst8_reg, stride_reg - movh m1, [dst2_reg+mstride_reg*4] - movh m6, [dst8_reg+mstride_reg*4] - movh m5, [dst2_reg] - movh m7, [dst8_reg] + add dst8q, strideq + movh m1, [dst2q+mstrideq*4] + movh m6, [dst8q+mstrideq*4] + movh m5, [dst2q] + movh m7, [dst8q] punpcklbw m1, m6 ; B/J punpcklbw m5, m7 ; F/N - movh m6, [dst2_reg+ stride_reg] - movh m7, [dst8_reg+ stride_reg] + movh m6, [dst2q+ strideq ] + movh m7, [dst8q+ strideq ] punpcklbw m6, m7 ; G/O ; 8x16 transpose @@ -1829,10 +1806,10 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13 %ifdef m8 SWAP 1, 8 %else - mova q0backup, m1 + mova m_q0backup, m1 %endif - movh m7, [dst2_reg+ stride_reg*2] - movh m1, [dst8_reg+ stride_reg*2] + movh m7, [dst2q+ strideq*2] + movh m1, [dst8q+ strideq*2] punpcklbw m7, m1 ; H/P TRANSPOSE4x4B 4, 5, 6, 7, 1 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 @@ -1842,14 +1819,14 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13 SWAP 1, 8 SWAP 2, 8 %else - mova m1, q0backup - mova q0backup, m2 ; store q0 + mova m1, m_q0backup + mova m_q0backup, m2 ; store q0 %endif SBUTTERFLY dq, 1, 5, 2 ; p1/p0 %ifdef m12 SWAP 5, 12 %else - mova p0backup, m5 ; store p0 + mova m_p0backup, m5 ; store p0 %endif SWAP 1, 4 SWAP 2, 4 @@ -1883,7 +1860,7 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13 por m6, m4 ; abs(q2-q1) %if notcpuflag(mmx2) - mova m4, flim_I + mova m4, m_flimI pxor m3, m3 psubusb m0, m4 psubusb m1, m4 @@ -1905,14 +1882,14 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13 ; normal_limit and high_edge_variance for p1-p0, q1-q0 SWAP 7, 3 ; now m7 is zero %ifidn %1, v - movrow m3, [dst_reg +mstride_reg] ; p0 -%if mmsize == 16 && %3 == 8 - movhps m3, [dst8_reg+mstride_reg] + movrow m3, [dst1q+mstrideq ] ; p0 +%if mmsize == 16 && %2 == 8 + movhps m3, [dst8q+mstrideq ] %endif %elifdef m12 SWAP 3, 12 %else - mova m3, p0backup + mova m3, m_p0backup %endif mova m1, m2 @@ -1925,11 +1902,11 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13 %if notcpuflag(mmx2) mova m6, m1 psubusb m1, m4 - psubusb m6, hev_thr + psubusb m6, m_hevthr pcmpeqb m1, m7 ; abs(p1-p0) <= I pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh pand m0, m1 - mova mask_res, m6 + mova m_maskres, m6 %else ; mmxext/sse2 pmaxub m0, m1 ; max_I SWAP 1, 4 ; max_hev_thresh @@ -1937,14 +1914,14 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13 SWAP 6, 4 ; now m6 is I %ifidn %1, v - movrow m4, [dst_reg] ; q0 -%if mmsize == 16 && %3 == 8 - movhps m4, [dst8_reg] + movrow m4, [dst1q] ; q0 +%if mmsize == 16 && %2 == 8 + movhps m4, [dst8q] %endif %elifdef m8 SWAP 4, 8 %else - mova m4, q0backup + mova m4, m_q0backup %endif mova m1, m4 SWAP 1, 4 @@ -1956,26 +1933,26 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13 %if notcpuflag(mmx2) mova m7, m1 psubusb m1, m6 - psubusb m7, hev_thr + psubusb m7, m_hevthr pxor m6, m6 pcmpeqb m1, m6 ; abs(q1-q0) <= I pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh - mova m6, mask_res + mova m6, m_maskres pand m0, m1 ; abs([pq][321]-[pq][210]) <= I pand m6, m7 %else ; mmxext/sse2 pxor m7, m7 pmaxub m0, m1 pmaxub m6, m1 - psubusb m0, flim_I - psubusb m6, hev_thr + psubusb m0, m_flimI + psubusb m6, m_hevthr pcmpeqb m0, m7 ; max(abs(..)) <= I pcmpeqb m6, m7 ; !(max(abs..) > thresh) %endif %ifdef m12 SWAP 6, 12 %else - mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) + mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) %endif ; simple_limit @@ -1999,28 +1976,28 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13 pand m7, [pb_FE] psrlq m7, 1 ; abs(q1-p1)/2 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 - psubusb m7, flim_E + psubusb m7, m_flimE pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E pand m0, m7 ; normal_limit result ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask %ifdef m8 ; x86-64 && sse2 mova m8, [pb_80] -%define pb_80_var m8 +%define m_pb_80 m8 %else ; x86-32 or mmx/mmxext -%define pb_80_var [pb_80] +%define m_pb_80 [pb_80] %endif mova m1, m4 mova m7, m3 - pxor m1, pb_80_var - pxor m7, pb_80_var + pxor m1, m_pb_80 + pxor m7, m_pb_80 psubsb m1, m7 ; (signed) q0-p0 mova m6, m2 mova m7, m5 - pxor m6, pb_80_var - pxor m7, pb_80_var + pxor m6, m_pb_80 + pxor m7, m_pb_80 psubsb m6, m7 ; (signed) p1-q1 - mova m7, mask_res + mova m7, m_maskres pandn m7, m6 paddsb m7, m1 paddsb m7, m1 @@ -2059,7 +2036,7 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13 %ifdef m12 SWAP 6, 12 %else - mova m6, mask_res + mova m6, m_maskres %endif %if notcpuflag(mmx2) mova m7, [pb_1] @@ -2087,81 +2064,81 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13 ; store %ifidn %1, v - movrow [dst_reg +mstride_reg*2], m2 - movrow [dst_reg +mstride_reg ], m3 - movrow [dst_reg], m4 - movrow [dst_reg + stride_reg ], m5 -%if mmsize == 16 && %3 == 8 - movhps [dst8_reg+mstride_reg*2], m2 - movhps [dst8_reg+mstride_reg ], m3 - movhps [dst8_reg], m4 - movhps [dst8_reg+ stride_reg ], m5 + movrow [dst1q+mstrideq*2], m2 + movrow [dst1q+mstrideq ], m3 + movrow [dst1q], m4 + movrow [dst1q+ strideq ], m5 +%if mmsize == 16 && %2 == 8 + movhps [dst8q+mstrideq*2], m2 + movhps [dst8q+mstrideq ], m3 + movhps [dst8q], m4 + movhps [dst8q+ strideq ], m5 %endif %else ; h - add dst_reg, 2 - add dst2_reg, 2 + add dst1q, 2 + add dst2q, 2 ; 4x8/16 transpose TRANSPOSE4x4B 2, 3, 4, 5, 6 %if mmsize == 8 ; mmx/mmxext (h) - WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg + WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq %else ; sse2 (h) - lea dst8_reg, [dst8_reg+mstride_reg+2] - WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %3 + lea dst8q, [dst8q+mstrideq +2] + WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2 %endif %endif %if mmsize == 8 -%if %3 == 8 ; chroma +%if %2 == 8 ; chroma %ifidn %1, h - sub dst_reg, 2 + sub dst1q, 2 %endif - cmp dst_reg, dst8_reg - mov dst_reg, dst8_reg + cmp dst1q, dst8q + mov dst1q, dst8q jnz .next8px %else %ifidn %1, h - lea dst_reg, [dst_reg + stride_reg*8-2] + lea dst1q, [dst1q+ strideq*8-2] %else ; v - add dst_reg, 8 + add dst1q, 8 %endif - dec cnt_reg + dec cntrq jg .next8px %endif %endif %ifndef m8 ; sse2 on x86-32 or mmx/mmxext - mov rsp, stack_reg ; restore stack pointer + ADD rsp, pad %endif RET %endmacro %if ARCH_X86_32 INIT_MMX mmx -INNER_LOOPFILTER v, 6, 16 -INNER_LOOPFILTER h, 6, 16 -INNER_LOOPFILTER v, 6, 8 -INNER_LOOPFILTER h, 6, 8 +INNER_LOOPFILTER v, 16 +INNER_LOOPFILTER h, 16 +INNER_LOOPFILTER v, 8 +INNER_LOOPFILTER h, 8 INIT_MMX mmx2 -INNER_LOOPFILTER v, 6, 16 -INNER_LOOPFILTER h, 6, 16 -INNER_LOOPFILTER v, 6, 8 -INNER_LOOPFILTER h, 6, 8 +INNER_LOOPFILTER v, 16 +INNER_LOOPFILTER h, 16 +INNER_LOOPFILTER v, 8 +INNER_LOOPFILTER h, 8 %endif INIT_XMM sse2 -INNER_LOOPFILTER v, 5, 16 -INNER_LOOPFILTER h, 5 + ARCH_X86_32, 16 -INNER_LOOPFILTER v, 6, 8 -INNER_LOOPFILTER h, 6, 8 +INNER_LOOPFILTER v, 16 +INNER_LOOPFILTER h, 16 +INNER_LOOPFILTER v, 8 +INNER_LOOPFILTER h, 8 INIT_XMM ssse3 -INNER_LOOPFILTER v, 5, 16 -INNER_LOOPFILTER h, 5 + ARCH_X86_32, 16 -INNER_LOOPFILTER v, 6, 8 -INNER_LOOPFILTER h, 6, 8 +INNER_LOOPFILTER v, 16 +INNER_LOOPFILTER h, 16 +INNER_LOOPFILTER v, 8 +INNER_LOOPFILTER h, 8 ;----------------------------------------------------------------------------- ; void vp8_h/v_loop_filter_mbedge_(uint8_t *dst, [uint8_t *v,] int stride, From a928ed375108caf1dc8a0a2a6f42e5f17665e74a Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sun, 4 Mar 2012 20:21:22 -0800 Subject: [PATCH 2/7] vp8: convert mbedge loopfilter x86 assembly to use named arguments. --- libavcodec/x86/vp8dsp.asm | 447 ++++++++++++++++++-------------------- 1 file changed, 212 insertions(+), 235 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 793087e99c..4cfeb3e330 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -2145,189 +2145,166 @@ INNER_LOOPFILTER h, 8 ; int flimE, int flimI, int hev_thr); ;----------------------------------------------------------------------------- -%macro MBEDGE_LOOPFILTER 3 -%if %3 == 8 ; chroma -cglobal vp8_%1_loop_filter8uv_mbedge, 6, %2, 15 -%define dst8_reg r1 -%define mstride_reg r2 -%define E_reg r3 -%define I_reg r4 -%define hev_thr_reg r5 +%macro MBEDGE_LOOPFILTER 2 +%if %2 == 8 ; chroma +cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, dst1, dst8, stride, flimE, flimI, hevthr %else ; luma -cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 -%define mstride_reg r1 -%define E_reg r2 -%define I_reg r3 -%define hev_thr_reg r4 -%ifdef m8 ; x86-64, sse2 -%define dst8_reg r4 -%elif mmsize == 16 ; x86-32, sse2 -%define dst8_reg r5 -%else ; x86-32, mmx/mmxext -%define cnt_reg r5 -%endif -%endif -%define dst_reg r0 -%define stride_reg E_reg -%define dst2_reg I_reg -%ifndef m8 -%define stack_reg hev_thr_reg +cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevthr %endif %if cpuflag(ssse3) pxor m7, m7 %endif - -%ifndef m8 ; mmx/mmxext or sse2 on x86-32 +%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr +%if mmsize == 16 ; [3]=hev() result + ; [4]=filter tmp result + ; [5]/[6] = p2/q2 backup + ; [7]=lim_res sign result +%assign pad 16 + mmsize * 7 - gprsize - (stack_offset & 15) +%else ; 8 ; extra storage space for transposes +%assign pad 16 + mmsize * 8 - gprsize - (stack_offset & 15) +%endif ; splat function arguments - SPLATB_REG m0, E_reg, m7 ; E - SPLATB_REG m1, I_reg, m7 ; I - SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh + SPLATB_REG m0, flimEq, m7 ; E + SPLATB_REG m1, flimIq, m7 ; I + SPLATB_REG m2, hevthrq, m7 ; hev_thresh - ; align stack - mov stack_reg, rsp ; backup stack pointer - and rsp, ~(mmsize-1) ; align stack -%if mmsize == 16 - sub rsp, mmsize * 7 -%else - sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr - ; [3]=hev() result - ; [4]=filter tmp result - ; [5]/[6] = p2/q2 backup - ; [7]=lim_res sign result -%endif - -%define flim_E [rsp] -%define flim_I [rsp+mmsize] -%define hev_thr [rsp+mmsize*2] -%define mask_res [rsp+mmsize*3] -%define lim_res [rsp+mmsize*4] -%define p0backup [rsp+mmsize*3] -%define q0backup [rsp+mmsize*4] -%define p2backup [rsp+mmsize*5] -%define q2backup [rsp+mmsize*6] + SUB rsp, pad + +%define m_flimE [rsp] +%define m_flimI [rsp+mmsize] +%define m_hevthr [rsp+mmsize*2] +%define m_maskres [rsp+mmsize*3] +%define m_limres [rsp+mmsize*4] +%define m_p0backup [rsp+mmsize*3] +%define m_q0backup [rsp+mmsize*4] +%define m_p2backup [rsp+mmsize*5] +%define m_q2backup [rsp+mmsize*6] %if mmsize == 16 -%define lim_sign [rsp] +%define m_limsign [rsp] %else -%define lim_sign [rsp+mmsize*7] +%define m_limsign [rsp+mmsize*7] %endif - mova flim_E, m0 - mova flim_I, m1 - mova hev_thr, m2 - + mova m_flimE, m0 + mova m_flimI, m1 + mova m_hevthr, m2 %else ; sse2 on x86-64 - -%define flim_E m9 -%define flim_I m10 -%define hev_thr m11 -%define mask_res m12 -%define lim_res m8 -%define p0backup m12 -%define q0backup m8 -%define p2backup m13 -%define q2backup m14 -%define lim_sign m9 +%define m_flimE m9 +%define m_flimI m10 +%define m_hevthr m11 +%define m_maskres m12 +%define m_limres m8 +%define m_p0backup m12 +%define m_q0backup m8 +%define m_p2backup m13 +%define m_q2backup m14 +%define m_limsign m9 ; splat function arguments - SPLATB_REG flim_E, E_reg, m7 ; E - SPLATB_REG flim_I, I_reg, m7 ; I - SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh + SPLATB_REG m_flimE, flimEq, m7 ; E + SPLATB_REG m_flimI, flimIq, m7 ; I + SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh %endif -%if mmsize == 8 && %3 == 16 ; mmx/mmxext - mov cnt_reg, 2 +%if %2 == 8 ; chroma + DEFINE_ARGS dst1, dst8, mstride, stride, dst2 +%elif mmsize == 8 + DEFINE_ARGS dst1, mstride, stride, dst2, cntr + mov cntrq, 2 +%else + DEFINE_ARGS dst1, mstride, stride, dst2, dst8 %endif - mov stride_reg, mstride_reg - neg mstride_reg + mov strideq, mstrideq + neg mstrideq %ifidn %1, h - lea dst_reg, [dst_reg + stride_reg*4-4] -%if %3 == 8 - lea dst8_reg, [dst8_reg+ stride_reg*4-4] + lea dst1q, [dst1q+strideq*4-4] +%if %2 == 8 ; chroma + lea dst8q, [dst8q+strideq*4-4] %endif %endif %if mmsize == 8 -.next8px +.next8px: %endif ; read - lea dst2_reg, [dst_reg + stride_reg] + lea dst2q, [dst1q+ strideq ] %ifidn %1, v -%if %3 == 8 && mmsize == 16 +%if %2 == 8 && mmsize == 16 %define movrow movh %else %define movrow mova %endif - movrow m0, [dst_reg +mstride_reg*4] ; p3 - movrow m1, [dst2_reg+mstride_reg*4] ; p2 - movrow m2, [dst_reg +mstride_reg*2] ; p1 - movrow m5, [dst2_reg] ; q1 - movrow m6, [dst2_reg+ stride_reg] ; q2 - movrow m7, [dst2_reg+ stride_reg*2] ; q3 -%if mmsize == 16 && %3 == 8 - movhps m0, [dst8_reg+mstride_reg*4] - movhps m2, [dst8_reg+mstride_reg*2] - add dst8_reg, stride_reg - movhps m1, [dst8_reg+mstride_reg*4] - movhps m5, [dst8_reg] - movhps m6, [dst8_reg+ stride_reg] - movhps m7, [dst8_reg+ stride_reg*2] - add dst8_reg, mstride_reg + movrow m0, [dst1q+mstrideq*4] ; p3 + movrow m1, [dst2q+mstrideq*4] ; p2 + movrow m2, [dst1q+mstrideq*2] ; p1 + movrow m5, [dst2q] ; q1 + movrow m6, [dst2q+ strideq ] ; q2 + movrow m7, [dst2q+ strideq*2] ; q3 +%if mmsize == 16 && %2 == 8 + movhps m0, [dst8q+mstrideq*4] + movhps m2, [dst8q+mstrideq*2] + add dst8q, strideq + movhps m1, [dst8q+mstrideq*4] + movhps m5, [dst8q] + movhps m6, [dst8q+ strideq ] + movhps m7, [dst8q+ strideq*2] + add dst8q, mstrideq %endif %elif mmsize == 8 ; mmx/mmxext (h) ; read 8 rows of 8px each - movu m0, [dst_reg +mstride_reg*4] - movu m1, [dst2_reg+mstride_reg*4] - movu m2, [dst_reg +mstride_reg*2] - movu m3, [dst_reg +mstride_reg] - movu m4, [dst_reg] - movu m5, [dst2_reg] - movu m6, [dst2_reg+ stride_reg] + movu m0, [dst1q+mstrideq*4] + movu m1, [dst2q+mstrideq*4] + movu m2, [dst1q+mstrideq*2] + movu m3, [dst1q+mstrideq ] + movu m4, [dst1q] + movu m5, [dst2q] + movu m6, [dst2q+ strideq ] ; 8x8 transpose TRANSPOSE4x4B 0, 1, 2, 3, 7 - mova q0backup, m1 - movu m7, [dst2_reg+ stride_reg*2] + mova m_q0backup, m1 + movu m7, [dst2q+ strideq*2] TRANSPOSE4x4B 4, 5, 6, 7, 1 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 - mova m1, q0backup - mova q0backup, m2 ; store q0 + mova m1, m_q0backup + mova m_q0backup, m2 ; store q0 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 - mova p0backup, m5 ; store p0 + mova m_p0backup, m5 ; store p0 SWAP 1, 4 SWAP 2, 4 SWAP 6, 3 SWAP 5, 3 %else ; sse2 (h) -%if %3 == 16 - lea dst8_reg, [dst_reg + stride_reg*8] +%if %2 == 16 + lea dst8q, [dst1q+ strideq*8 ] %endif ; read 16 rows of 8px each, interleave - movh m0, [dst_reg +mstride_reg*4] - movh m1, [dst8_reg+mstride_reg*4] - movh m2, [dst_reg +mstride_reg*2] - movh m5, [dst8_reg+mstride_reg*2] - movh m3, [dst_reg +mstride_reg] - movh m6, [dst8_reg+mstride_reg] - movh m4, [dst_reg] - movh m7, [dst8_reg] + movh m0, [dst1q+mstrideq*4] + movh m1, [dst8q+mstrideq*4] + movh m2, [dst1q+mstrideq*2] + movh m5, [dst8q+mstrideq*2] + movh m3, [dst1q+mstrideq ] + movh m6, [dst8q+mstrideq ] + movh m4, [dst1q] + movh m7, [dst8q] punpcklbw m0, m1 ; A/I punpcklbw m2, m5 ; C/K punpcklbw m3, m6 ; D/L punpcklbw m4, m7 ; E/M - add dst8_reg, stride_reg - movh m1, [dst2_reg+mstride_reg*4] - movh m6, [dst8_reg+mstride_reg*4] - movh m5, [dst2_reg] - movh m7, [dst8_reg] + add dst8q, strideq + movh m1, [dst2q+mstrideq*4] + movh m6, [dst8q+mstrideq*4] + movh m5, [dst2q] + movh m7, [dst8q] punpcklbw m1, m6 ; B/J punpcklbw m5, m7 ; F/N - movh m6, [dst2_reg+ stride_reg] - movh m7, [dst8_reg+ stride_reg] + movh m6, [dst2q+ strideq ] + movh m7, [dst8q+ strideq ] punpcklbw m6, m7 ; G/O ; 8x16 transpose @@ -2335,10 +2312,10 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 %ifdef m8 SWAP 1, 8 %else - mova q0backup, m1 + mova m_q0backup, m1 %endif - movh m7, [dst2_reg+ stride_reg*2] - movh m1, [dst8_reg+ stride_reg*2] + movh m7, [dst2q+ strideq*2] + movh m1, [dst8q+ strideq*2] punpcklbw m7, m1 ; H/P TRANSPOSE4x4B 4, 5, 6, 7, 1 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 @@ -2348,14 +2325,14 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 SWAP 1, 8 SWAP 2, 8 %else - mova m1, q0backup - mova q0backup, m2 ; store q0 + mova m1, m_q0backup + mova m_q0backup, m2 ; store q0 %endif SBUTTERFLY dq, 1, 5, 2 ; p1/p0 %ifdef m12 SWAP 5, 12 %else - mova p0backup, m5 ; store p0 + mova m_p0backup, m5 ; store p0 %endif SWAP 1, 4 SWAP 2, 4 @@ -2373,7 +2350,7 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 mova m4, m2 SWAP 4, 2 psubusb m4, m1 ; p1-p2 - mova p2backup, m1 + mova m_p2backup, m1 psubusb m1, m2 ; p2-p1 por m1, m4 ; abs(p2-p1) @@ -2386,12 +2363,12 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 mova m4, m5 SWAP 4, 5 psubusb m4, m6 ; q1-q2 - mova q2backup, m6 + mova m_q2backup, m6 psubusb m6, m5 ; q2-q1 por m6, m4 ; abs(q2-q1) %if notcpuflag(mmx2) - mova m4, flim_I + mova m4, m_flimI pxor m3, m3 psubusb m0, m4 psubusb m1, m4 @@ -2413,14 +2390,14 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 ; normal_limit and high_edge_variance for p1-p0, q1-q0 SWAP 7, 3 ; now m7 is zero %ifidn %1, v - movrow m3, [dst_reg +mstride_reg] ; p0 -%if mmsize == 16 && %3 == 8 - movhps m3, [dst8_reg+mstride_reg] + movrow m3, [dst1q+mstrideq ] ; p0 +%if mmsize == 16 && %2 == 8 + movhps m3, [dst8q+mstrideq ] %endif %elifdef m12 SWAP 3, 12 %else - mova m3, p0backup + mova m3, m_p0backup %endif mova m1, m2 @@ -2433,11 +2410,11 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 %if notcpuflag(mmx2) mova m6, m1 psubusb m1, m4 - psubusb m6, hev_thr + psubusb m6, m_hevthr pcmpeqb m1, m7 ; abs(p1-p0) <= I pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh pand m0, m1 - mova mask_res, m6 + mova m_maskres, m6 %else ; mmxext/sse2 pmaxub m0, m1 ; max_I SWAP 1, 4 ; max_hev_thresh @@ -2445,14 +2422,14 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 SWAP 6, 4 ; now m6 is I %ifidn %1, v - movrow m4, [dst_reg] ; q0 -%if mmsize == 16 && %3 == 8 - movhps m4, [dst8_reg] + movrow m4, [dst1q] ; q0 +%if mmsize == 16 && %2 == 8 + movhps m4, [dst8q] %endif %elifdef m8 SWAP 4, 8 %else - mova m4, q0backup + mova m4, m_q0backup %endif mova m1, m4 SWAP 1, 4 @@ -2464,26 +2441,26 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 %if notcpuflag(mmx2) mova m7, m1 psubusb m1, m6 - psubusb m7, hev_thr + psubusb m7, m_hevthr pxor m6, m6 pcmpeqb m1, m6 ; abs(q1-q0) <= I pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh - mova m6, mask_res + mova m6, m_maskres pand m0, m1 ; abs([pq][321]-[pq][210]) <= I pand m6, m7 %else ; mmxext/sse2 pxor m7, m7 pmaxub m0, m1 pmaxub m6, m1 - psubusb m0, flim_I - psubusb m6, hev_thr + psubusb m0, m_flimI + psubusb m6, m_hevthr pcmpeqb m0, m7 ; max(abs(..)) <= I pcmpeqb m6, m7 ; !(max(abs..) > thresh) %endif %ifdef m12 SWAP 6, 12 %else - mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) + mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) %endif ; simple_limit @@ -2507,39 +2484,39 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 pand m7, [pb_FE] psrlq m7, 1 ; abs(q1-p1)/2 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 - psubusb m7, flim_E + psubusb m7, m_flimE pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E pand m0, m7 ; normal_limit result ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask %ifdef m8 ; x86-64 && sse2 mova m8, [pb_80] -%define pb_80_var m8 +%define m_pb_80 m8 %else ; x86-32 or mmx/mmxext -%define pb_80_var [pb_80] +%define m_pb_80 [pb_80] %endif mova m1, m4 mova m7, m3 - pxor m1, pb_80_var - pxor m7, pb_80_var + pxor m1, m_pb_80 + pxor m7, m_pb_80 psubsb m1, m7 ; (signed) q0-p0 mova m6, m2 mova m7, m5 - pxor m6, pb_80_var - pxor m7, pb_80_var + pxor m6, m_pb_80 + pxor m7, m_pb_80 psubsb m6, m7 ; (signed) p1-q1 - mova m7, mask_res + mova m7, m_maskres paddsb m6, m1 paddsb m6, m1 paddsb m6, m1 pand m6, m0 %ifdef m8 - mova lim_res, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge - pand lim_res, m7 + mova m_limres, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge + pand m_limres, m7 %else mova m0, m6 pand m0, m7 - mova lim_res, m0 + mova m_limres, m0 %endif pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common @@ -2581,7 +2558,7 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 %ifdef m8 SWAP 1, 8 %else - mova m1, lim_res + mova m1, m_limres %endif pxor m0, m0 mova m6, m1 @@ -2593,11 +2570,11 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 punpcklbw m6, m0 ; signed byte->word punpckhbw m1, m0 %endif - mova lim_sign, m0 + mova m_limsign, m0 %if cpuflag(ssse3) mova m7, [pb_27_63] %ifndef m8 - mova lim_res, m1 + mova m_limres, m1 %endif %ifdef m10 SWAP 0, 10 ; don't lose lim_sign copy @@ -2610,11 +2587,11 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 %ifdef m10 SWAP 0, 10 %else - mova m0, lim_sign + mova m0, m_limsign %endif %else - mova mask_res, m6 ; backup for later in filter - mova lim_res, m1 + mova m_maskres, m6 ; backup for later in filter + mova m_limres, m1 pmullw m6, [pw_27] pmullw m1, [pw_27] paddw m6, m7 @@ -2640,7 +2617,7 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 %ifdef m10 SWAP 1, 10 %else - mova m1, lim_res + mova m1, m_limres %endif mova m0, m7 pmaddubsw m7, m6 @@ -2650,16 +2627,16 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 %ifdef m10 SWAP 0, 10 %endif - mova m0, lim_sign + mova m0, m_limsign %else - mova m6, mask_res - mova m1, lim_res + mova m6, m_maskres + mova m1, m_limres pmullw m6, [pw_18] pmullw m1, [pw_18] paddw m6, m7 paddw m1, m7 %endif - mova m0, lim_sign + mova m0, m_limsign psraw m6, 7 psraw m1, 7 packsswb m6, m1 ; a1 @@ -2680,7 +2657,7 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 %ifdef m10 SWAP 1, 10 %else - mova m1, lim_res + mova m1, m_limres %endif mova m0, m7 pmaddubsw m7, m6 @@ -2692,8 +2669,8 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 SWAP 6, 12 SWAP 1, 8 %else - mova m6, mask_res - mova m1, lim_res + mova m6, m_maskres + mova m1, m_limres %endif pmullw m6, [pw_9] pmullw m1, [pw_9] @@ -2703,7 +2680,7 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 %ifdef m9 SWAP 7, 9 %else - mova m7, lim_sign + mova m7, m_limsign %endif psraw m6, 7 psraw m1, 7 @@ -2716,8 +2693,8 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 SWAP 1, 13 SWAP 6, 14 %else - mova m1, p2backup - mova m6, q2backup + mova m1, m_p2backup + mova m6, m_q2backup %endif psubusb m1, m0 paddusb m6, m0 @@ -2726,101 +2703,101 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15 ; store %ifidn %1, v - movrow [dst2_reg+mstride_reg*4], m1 - movrow [dst_reg +mstride_reg*2], m2 - movrow [dst_reg +mstride_reg ], m3 - movrow [dst_reg], m4 - movrow [dst2_reg], m5 - movrow [dst2_reg+ stride_reg ], m6 -%if mmsize == 16 && %3 == 8 - add dst8_reg, mstride_reg - movhps [dst8_reg+mstride_reg*2], m1 - movhps [dst8_reg+mstride_reg ], m2 - movhps [dst8_reg], m3 - add dst8_reg, stride_reg - movhps [dst8_reg], m4 - movhps [dst8_reg+ stride_reg ], m5 - movhps [dst8_reg+ stride_reg*2], m6 + movrow [dst2q+mstrideq*4], m1 + movrow [dst1q+mstrideq*2], m2 + movrow [dst1q+mstrideq ], m3 + movrow [dst1q], m4 + movrow [dst2q], m5 + movrow [dst2q+ strideq ], m6 +%if mmsize == 16 && %2 == 8 + add dst8q, mstrideq + movhps [dst8q+mstrideq*2], m1 + movhps [dst8q+mstrideq ], m2 + movhps [dst8q], m3 + add dst8q, strideq + movhps [dst8q], m4 + movhps [dst8q+ strideq ], m5 + movhps [dst8q+ strideq*2], m6 %endif %else ; h - inc dst_reg - inc dst2_reg + inc dst1q + inc dst2q ; 4x8/16 transpose - TRANSPOSE4x4B 1, 2, 3, 4, 0 - SBUTTERFLY bw, 5, 6, 0 + TRANSPOSE4x4B 1, 2, 3, 4, 0 + SBUTTERFLY bw, 5, 6, 0 %if mmsize == 8 ; mmx/mmxext (h) - WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg - add dst_reg, 4 - WRITE_2x4W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg + WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq + add dst1q, 4 + WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq %else ; sse2 (h) - lea dst8_reg, [dst8_reg+mstride_reg+1] - WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %3 - lea dst_reg, [dst2_reg+mstride_reg+4] - lea dst8_reg, [dst8_reg+mstride_reg+4] + lea dst8q, [dst8q+mstrideq+1] + WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2 + lea dst1q, [dst2q+mstrideq+4] + lea dst8q, [dst8q+mstrideq+4] %if cpuflag(sse4) - add dst2_reg, 4 + add dst2q, 4 %endif - WRITE_8W m5, dst2_reg, dst_reg, mstride_reg, stride_reg + WRITE_8W m5, dst2q, dst1q, mstrideq, strideq %if cpuflag(sse4) - lea dst2_reg, [dst8_reg+ stride_reg] + lea dst2q, [dst8q+ strideq ] %endif - WRITE_8W m6, dst2_reg, dst8_reg, mstride_reg, stride_reg + WRITE_8W m6, dst2q, dst8q, mstrideq, strideq %endif %endif %if mmsize == 8 -%if %3 == 8 ; chroma +%if %2 == 8 ; chroma %ifidn %1, h - sub dst_reg, 5 + sub dst1q, 5 %endif - cmp dst_reg, dst8_reg - mov dst_reg, dst8_reg + cmp dst1q, dst8q + mov dst1q, dst8q jnz .next8px %else %ifidn %1, h - lea dst_reg, [dst_reg + stride_reg*8-5] + lea dst1q, [dst1q+ strideq*8-5] %else ; v - add dst_reg, 8 + add dst1q, 8 %endif - dec cnt_reg + dec cntrq jg .next8px %endif %endif %ifndef m8 ; sse2 on x86-32 or mmx/mmxext - mov rsp, stack_reg ; restore stack pointer + ADD rsp, pad %endif RET %endmacro %if ARCH_X86_32 INIT_MMX mmx -MBEDGE_LOOPFILTER v, 6, 16 -MBEDGE_LOOPFILTER h, 6, 16 -MBEDGE_LOOPFILTER v, 6, 8 -MBEDGE_LOOPFILTER h, 6, 8 +MBEDGE_LOOPFILTER v, 16 +MBEDGE_LOOPFILTER h, 16 +MBEDGE_LOOPFILTER v, 8 +MBEDGE_LOOPFILTER h, 8 INIT_MMX mmx2 -MBEDGE_LOOPFILTER v, 6, 16 -MBEDGE_LOOPFILTER h, 6, 16 -MBEDGE_LOOPFILTER v, 6, 8 -MBEDGE_LOOPFILTER h, 6, 8 +MBEDGE_LOOPFILTER v, 16 +MBEDGE_LOOPFILTER h, 16 +MBEDGE_LOOPFILTER v, 8 +MBEDGE_LOOPFILTER h, 8 %endif INIT_XMM sse2 -MBEDGE_LOOPFILTER v, 5, 16 -MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16 -MBEDGE_LOOPFILTER v, 6, 8 -MBEDGE_LOOPFILTER h, 6, 8 +MBEDGE_LOOPFILTER v, 16 +MBEDGE_LOOPFILTER h, 16 +MBEDGE_LOOPFILTER v, 8 +MBEDGE_LOOPFILTER h, 8 INIT_XMM ssse3 -MBEDGE_LOOPFILTER v, 5, 16 -MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16 -MBEDGE_LOOPFILTER v, 6, 8 -MBEDGE_LOOPFILTER h, 6, 8 +MBEDGE_LOOPFILTER v, 16 +MBEDGE_LOOPFILTER h, 16 +MBEDGE_LOOPFILTER v, 8 +MBEDGE_LOOPFILTER h, 8 INIT_XMM sse4 -MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16 -MBEDGE_LOOPFILTER h, 6, 8 +MBEDGE_LOOPFILTER h, 16 +MBEDGE_LOOPFILTER h, 8 From f1279e286b00e99f343adb51e251f036a3df6f32 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 8 Mar 2012 16:32:46 -0800 Subject: [PATCH 3/7] xxan: don't read before start of buffer in av_memcpy_backptr(). Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind CC: libav-stable@libav.org --- libavcodec/xxan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libavcodec/xxan.c b/libavcodec/xxan.c index 4659d34972..8147bad5f1 100644 --- a/libavcodec/xxan.c +++ b/libavcodec/xxan.c @@ -129,7 +129,8 @@ static int xan_unpack(uint8_t *dest, const int dest_len, if (size + size2 > dest_end - dest) break; } - if (src + size > src_end || dest + size + size2 > dest_end) + if (src + size > src_end || dest + size + size2 > dest_end || + dest - orig_dest + size < back) return -1; bytestream_get_buffer(&src, dest, size); dest += size; From 55188278169c3a1838334d7aa47a1f7a40741690 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 8 Mar 2012 16:32:47 -0800 Subject: [PATCH 4/7] xxan: convert to bytestream2 API. Protects against overreads. Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind CC: libav-stable@libav.org --- libavcodec/xxan.c | 117 ++++++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 61 deletions(-) diff --git a/libavcodec/xxan.c b/libavcodec/xxan.c index 8147bad5f1..86b4195ce4 100644 --- a/libavcodec/xxan.c +++ b/libavcodec/xxan.c @@ -35,6 +35,7 @@ typedef struct XanContext { uint8_t *y_buffer; uint8_t *scratch_buffer; int buffer_size; + GetByteContext gb; } XanContext; static av_cold int xan_decode_init(AVCodecContext *avctx) @@ -58,29 +59,29 @@ static av_cold int xan_decode_init(AVCodecContext *avctx) return 0; } -static int xan_unpack_luma(const uint8_t *src, const int src_size, +static int xan_unpack_luma(XanContext *s, uint8_t *dst, const int dst_size) { int tree_size, eof; - const uint8_t *tree; int bits, mask; int tree_root, node; const uint8_t *dst_end = dst + dst_size; - const uint8_t *src_end = src + src_size; + GetByteContext tree = s->gb; + int start_off = bytestream2_tell(&tree); - tree_size = *src++; - eof = *src++; - tree = src - eof * 2 - 2; + tree_size = bytestream2_get_byte(&s->gb); + eof = bytestream2_get_byte(&s->gb); tree_root = eof + tree_size; - src += tree_size * 2; + bytestream2_skip(&s->gb, tree_size * 2); node = tree_root; - bits = *src++; + bits = bytestream2_get_byte(&s->gb); mask = 0x80; for (;;) { int bit = !!(bits & mask); mask >>= 1; - node = tree[node*2 + bit]; + bytestream2_seek(&tree, start_off + node*2 + bit - eof * 2, SEEK_SET); + node = bytestream2_get_byte(&tree); if (node == eof) break; if (node < eof) { @@ -90,49 +91,51 @@ static int xan_unpack_luma(const uint8_t *src, const int src_size, node = tree_root; } if (!mask) { - bits = *src++; - if (src > src_end) + if (bytestream2_get_bytes_left(&s->gb) <= 0) break; + bits = bytestream2_get_byteu(&s->gb); mask = 0x80; } } - return dst != dst_end; + return dst != dst_end ? AVERROR_INVALIDDATA : 0; } /* almost the same as in xan_wc3 decoder */ -static int xan_unpack(uint8_t *dest, const int dest_len, - const uint8_t *src, const int src_len) +static int xan_unpack(XanContext *s, + uint8_t *dest, const int dest_len) { uint8_t opcode; int size; uint8_t *orig_dest = dest; - const uint8_t *src_end = src + src_len; const uint8_t *dest_end = dest + dest_len; while (dest < dest_end) { - opcode = *src++; + if (bytestream2_get_bytes_left(&s->gb) <= 0) + return AVERROR_INVALIDDATA; + + opcode = bytestream2_get_byteu(&s->gb); if (opcode < 0xe0) { int size2, back; if ((opcode & 0x80) == 0) { size = opcode & 3; - back = ((opcode & 0x60) << 3) + *src++ + 1; + back = ((opcode & 0x60) << 3) + bytestream2_get_byte(&s->gb) + 1; size2 = ((opcode & 0x1c) >> 2) + 3; } else if ((opcode & 0x40) == 0) { - size = *src >> 6; - back = (bytestream_get_be16(&src) & 0x3fff) + 1; + size = bytestream2_peek_byte(&s->gb) >> 6; + back = (bytestream2_get_be16(&s->gb) & 0x3fff) + 1; size2 = (opcode & 0x3f) + 4; } else { size = opcode & 3; - back = ((opcode & 0x10) << 12) + bytestream_get_be16(&src) + 1; - size2 = ((opcode & 0x0c) << 6) + *src++ + 5; + back = ((opcode & 0x10) << 12) + bytestream2_get_be16(&s->gb) + 1; + size2 = ((opcode & 0x0c) << 6) + bytestream2_get_byte(&s->gb) + 5; if (size + size2 > dest_end - dest) break; } - if (src + size > src_end || dest + size + size2 > dest_end || + if (dest + size + size2 > dest_end || dest - orig_dest + size < back) return -1; - bytestream_get_buffer(&src, dest, size); + bytestream2_get_buffer(&s->gb, dest, size); dest += size; av_memcpy_backptr(dest, back, size2); dest += size2; @@ -140,9 +143,9 @@ static int xan_unpack(uint8_t *dest, const int dest_len, int finish = opcode >= 0xfc; size = finish ? opcode & 3 : ((opcode & 0x1f) << 2) + 4; - if (src + size > src_end || dest + size > dest_end) + if (dest_end - dest < size) return -1; - bytestream_get_buffer(&src, dest, size); + bytestream2_get_buffer(&s->gb, dest, size); dest += size; if (finish) break; @@ -151,38 +154,35 @@ static int xan_unpack(uint8_t *dest, const int dest_len, return dest - orig_dest; } -static int xan_decode_chroma(AVCodecContext *avctx, AVPacket *avpkt) +static int xan_decode_chroma(AVCodecContext *avctx, unsigned chroma_off) { - const uint8_t *buf = avpkt->data; XanContext *s = avctx->priv_data; uint8_t *U, *V; - unsigned chroma_off; int val, uval, vval; int i, j; const uint8_t *src, *src_end; const uint8_t *table; int mode, offset, dec_size; - chroma_off = AV_RL32(buf + 4); if (!chroma_off) return 0; - if (chroma_off + 10 >= avpkt->size) { + if (chroma_off + 4 >= bytestream2_get_bytes_left(&s->gb)) { av_log(avctx, AV_LOG_ERROR, "Invalid chroma block position\n"); return -1; } - src = avpkt->data + 4 + chroma_off; - table = src + 2; - mode = bytestream_get_le16(&src); - offset = bytestream_get_le16(&src) * 2; + bytestream2_seek(&s->gb, chroma_off + 4, SEEK_SET); + mode = bytestream2_get_le16(&s->gb); + table = s->gb.buffer; + offset = bytestream2_get_le16(&s->gb) * 2; - if (src - avpkt->data >= avpkt->size - offset) { + if (offset >= bytestream2_get_bytes_left(&s->gb)) { av_log(avctx, AV_LOG_ERROR, "Invalid chroma block offset\n"); return -1; } + bytestream2_skip(&s->gb, offset); memset(s->scratch_buffer, 0, s->buffer_size); - dec_size = xan_unpack(s->scratch_buffer, s->buffer_size, src + offset, - avpkt->size - offset - (src - avpkt->data)); + dec_size = xan_unpack(s, s->scratch_buffer, s->buffer_size); if (dec_size < 0) { av_log(avctx, AV_LOG_ERROR, "Chroma unpacking failed\n"); return -1; @@ -234,32 +234,27 @@ static int xan_decode_chroma(AVCodecContext *avctx, AVPacket *avpkt) return 0; } -static int xan_decode_frame_type0(AVCodecContext *avctx, AVPacket *avpkt) +static int xan_decode_frame_type0(AVCodecContext *avctx) { - const uint8_t *buf = avpkt->data; XanContext *s = avctx->priv_data; uint8_t *ybuf, *prev_buf, *src = s->scratch_buffer; unsigned chroma_off, corr_off; - int cur, last, size; + int cur, last; int i, j; int ret; - corr_off = AV_RL32(buf + 8); - chroma_off = AV_RL32(buf + 4); + chroma_off = bytestream2_get_le32(&s->gb); + corr_off = bytestream2_get_le32(&s->gb); - if ((ret = xan_decode_chroma(avctx, avpkt)) != 0) + if ((ret = xan_decode_chroma(avctx, chroma_off)) != 0) return ret; - size = avpkt->size - 4; - if (corr_off >= avpkt->size) { + if (corr_off >= (s->gb.buffer_end - s->gb.buffer_start)) { av_log(avctx, AV_LOG_WARNING, "Ignoring invalid correction block position\n"); corr_off = 0; } - if (corr_off) - size = corr_off; - if (chroma_off) - size = FFMIN(size, chroma_off); - ret = xan_unpack_luma(buf + 12, size, src, s->buffer_size >> 1); + bytestream2_seek(&s->gb, 12, SEEK_SET); + ret = xan_unpack_luma(s, src, s->buffer_size >> 1); if (ret) { av_log(avctx, AV_LOG_ERROR, "Luma decoding failed\n"); return ret; @@ -295,12 +290,11 @@ static int xan_decode_frame_type0(AVCodecContext *avctx, AVPacket *avpkt) if (corr_off) { int corr_end, dec_size; - corr_end = avpkt->size; + corr_end = (s->gb.buffer_end - s->gb.buffer_start); if (chroma_off > corr_off) corr_end = chroma_off; - dec_size = xan_unpack(s->scratch_buffer, s->buffer_size, - avpkt->data + 8 + corr_off, - corr_end - corr_off); + bytestream2_seek(&s->gb, 8 + corr_off, SEEK_SET); + dec_size = xan_unpack(s, s->scratch_buffer, s->buffer_size); if (dec_size < 0) dec_size = 0; for (i = 0; i < dec_size; i++) @@ -319,19 +313,19 @@ static int xan_decode_frame_type0(AVCodecContext *avctx, AVPacket *avpkt) return 0; } -static int xan_decode_frame_type1(AVCodecContext *avctx, AVPacket *avpkt) +static int xan_decode_frame_type1(AVCodecContext *avctx) { - const uint8_t *buf = avpkt->data; XanContext *s = avctx->priv_data; uint8_t *ybuf, *src = s->scratch_buffer; int cur, last; int i, j; int ret; - if ((ret = xan_decode_chroma(avctx, avpkt)) != 0) + if ((ret = xan_decode_chroma(avctx, bytestream2_get_le32(&s->gb))) != 0) return ret; - ret = xan_unpack_luma(buf + 16, avpkt->size - 16, src, + bytestream2_seek(&s->gb, 16, SEEK_SET); + ret = xan_unpack_luma(s, src, s->buffer_size >> 1); if (ret) { av_log(avctx, AV_LOG_ERROR, "Luma decoding failed\n"); @@ -381,13 +375,14 @@ static int xan_decode_frame(AVCodecContext *avctx, return ret; } - ftype = AV_RL32(avpkt->data); + bytestream2_init(&s->gb, avpkt->data, avpkt->size); + ftype = bytestream2_get_le32(&s->gb); switch (ftype) { case 0: - ret = xan_decode_frame_type0(avctx, avpkt); + ret = xan_decode_frame_type0(avctx); break; case 1: - ret = xan_decode_frame_type1(avctx, avpkt); + ret = xan_decode_frame_type1(avctx); break; default: av_log(avctx, AV_LOG_ERROR, "Unknown frame type %d\n", ftype); From f77bfa837636a99a4034d31916a76f7d1688cf5a Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sat, 10 Mar 2012 11:57:17 -0800 Subject: [PATCH 5/7] xxan: protect against chroma LUT overreads. Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind CC: libav-stable@libav.org --- libavcodec/xxan.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/libavcodec/xxan.c b/libavcodec/xxan.c index 86b4195ce4..dd5447940b 100644 --- a/libavcodec/xxan.c +++ b/libavcodec/xxan.c @@ -162,7 +162,7 @@ static int xan_decode_chroma(AVCodecContext *avctx, unsigned chroma_off) int i, j; const uint8_t *src, *src_end; const uint8_t *table; - int mode, offset, dec_size; + int mode, offset, dec_size, table_size; if (!chroma_off) return 0; @@ -171,9 +171,11 @@ static int xan_decode_chroma(AVCodecContext *avctx, unsigned chroma_off) return -1; } bytestream2_seek(&s->gb, chroma_off + 4, SEEK_SET); - mode = bytestream2_get_le16(&s->gb); - table = s->gb.buffer; - offset = bytestream2_get_le16(&s->gb) * 2; + mode = bytestream2_get_le16(&s->gb); + table = s->gb.buffer; + table_size = bytestream2_get_le16(&s->gb); + offset = table_size * 2; + table_size += 1; if (offset >= bytestream2_get_bytes_left(&s->gb)) { av_log(avctx, AV_LOG_ERROR, "Invalid chroma block offset\n"); @@ -196,7 +198,7 @@ static int xan_decode_chroma(AVCodecContext *avctx, unsigned chroma_off) for (j = 0; j < avctx->height >> 1; j++) { for (i = 0; i < avctx->width >> 1; i++) { val = *src++; - if (val) { + if (val && val < table_size) { val = AV_RL16(table + (val << 1)); uval = (val >> 3) & 0xF8; vval = (val >> 8) & 0xF8; @@ -216,7 +218,7 @@ static int xan_decode_chroma(AVCodecContext *avctx, unsigned chroma_off) for (j = 0; j < avctx->height >> 2; j++) { for (i = 0; i < avctx->width >> 1; i += 2) { val = *src++; - if (val) { + if (val && val < table_size) { val = AV_RL16(table + (val << 1)); uval = (val >> 3) & 0xF8; vval = (val >> 8) & 0xF8; From 71af42bd964313b3869cdd8a8c0fb97a9ee90d49 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 8 Mar 2012 16:32:49 -0800 Subject: [PATCH 6/7] xxan: reindent xan_unpack_luma(). It used 3-space indent instead of 4-space indent. --- libavcodec/xxan.c | 72 +++++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/libavcodec/xxan.c b/libavcodec/xxan.c index dd5447940b..4561714124 100644 --- a/libavcodec/xxan.c +++ b/libavcodec/xxan.c @@ -62,42 +62,42 @@ static av_cold int xan_decode_init(AVCodecContext *avctx) static int xan_unpack_luma(XanContext *s, uint8_t *dst, const int dst_size) { - int tree_size, eof; - int bits, mask; - int tree_root, node; - const uint8_t *dst_end = dst + dst_size; - GetByteContext tree = s->gb; - int start_off = bytestream2_tell(&tree); - - tree_size = bytestream2_get_byte(&s->gb); - eof = bytestream2_get_byte(&s->gb); - tree_root = eof + tree_size; - bytestream2_skip(&s->gb, tree_size * 2); - - node = tree_root; - bits = bytestream2_get_byte(&s->gb); - mask = 0x80; - for (;;) { - int bit = !!(bits & mask); - mask >>= 1; - bytestream2_seek(&tree, start_off + node*2 + bit - eof * 2, SEEK_SET); - node = bytestream2_get_byte(&tree); - if (node == eof) - break; - if (node < eof) { - *dst++ = node; - if (dst > dst_end) - break; - node = tree_root; - } - if (!mask) { - if (bytestream2_get_bytes_left(&s->gb) <= 0) - break; - bits = bytestream2_get_byteu(&s->gb); - mask = 0x80; - } - } - return dst != dst_end ? AVERROR_INVALIDDATA : 0; + int tree_size, eof; + int bits, mask; + int tree_root, node; + const uint8_t *dst_end = dst + dst_size; + GetByteContext tree = s->gb; + int start_off = bytestream2_tell(&tree); + + tree_size = bytestream2_get_byte(&s->gb); + eof = bytestream2_get_byte(&s->gb); + tree_root = eof + tree_size; + bytestream2_skip(&s->gb, tree_size * 2); + + node = tree_root; + bits = bytestream2_get_byte(&s->gb); + mask = 0x80; + for (;;) { + int bit = !!(bits & mask); + mask >>= 1; + bytestream2_seek(&tree, start_off + node*2 + bit - eof * 2, SEEK_SET); + node = bytestream2_get_byte(&tree); + if (node == eof) + break; + if (node < eof) { + *dst++ = node; + if (dst > dst_end) + break; + node = tree_root; + } + if (!mask) { + if (bytestream2_get_bytes_left(&s->gb) <= 0) + break; + bits = bytestream2_get_byteu(&s->gb); + mask = 0x80; + } + } + return dst != dst_end ? AVERROR_INVALIDDATA : 0; } /* almost the same as in xan_wc3 decoder */ From 442c3a8cb1785d74f8e2d7ab35b1862b7088436b Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 8 Mar 2012 17:09:27 -0800 Subject: [PATCH 7/7] cook: expand dither_tab[], and make sure indexes into it don't overflow. Fixes overflows in accessing dither_tab[]. Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind CC: libav-stable@libav.org --- libavcodec/cook.c | 6 +++++- libavcodec/cookdata.h | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/libavcodec/cook.c b/libavcodec/cook.c index 65e16e4077..41ce0e528e 100644 --- a/libavcodec/cook.c +++ b/libavcodec/cook.c @@ -507,7 +507,11 @@ static inline void expand_category(COOKContext *q, int *category, { int i; for (i = 0; i < q->num_vectors; i++) - ++category[category_index[i]]; + { + int idx = category_index[i]; + if (++category[idx] >= FF_ARRAY_ELEMS(dither_tab)) + --category[idx]; + } } /** diff --git a/libavcodec/cookdata.h b/libavcodec/cookdata.h index 126010a985..c4c26fae5f 100644 --- a/libavcodec/cookdata.h +++ b/libavcodec/cookdata.h @@ -36,8 +36,8 @@ static const int expbits_tab[8] = { 52,47,43,37,29,22,16,0, }; -static const float dither_tab[8] = { - 0.0, 0.0, 0.0, 0.0, 0.0, 0.176777, 0.25, 0.707107, +static const float dither_tab[9] = { + 0.0, 0.0, 0.0, 0.0, 0.0, 0.176777, 0.25, 0.707107, 1.0 }; static const float quant_centroid_tab[7][14] = {