| @@ -456,12 +456,12 @@ static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_si | |||
| "movdqu (%1,%3), %%xmm1 \n\t" | |||
| "movdqu (%1,%3,2), %%xmm2 \n\t" | |||
| "movdqu (%1,%4), %%xmm3 \n\t" | |||
| "lea (%1,%3,4), %1 \n\t" | |||
| "movdqa %%xmm0, (%2) \n\t" | |||
| "movdqa %%xmm1, (%2,%3) \n\t" | |||
| "movdqa %%xmm2, (%2,%3,2) \n\t" | |||
| "movdqa %%xmm3, (%2,%4) \n\t" | |||
| "subl $4, %0 \n\t" | |||
| "lea (%1,%3,4), %1 \n\t" | |||
| "lea (%2,%3,4), %2 \n\t" | |||
| "jnz 1b \n\t" | |||
| : "+g"(h), "+r" (pixels), "+r" (block) | |||
| @@ -478,6 +478,7 @@ static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_si | |||
| "movdqu (%1,%3), %%xmm1 \n\t" | |||
| "movdqu (%1,%3,2), %%xmm2 \n\t" | |||
| "movdqu (%1,%4), %%xmm3 \n\t" | |||
| "lea (%1,%3,4), %1 \n\t" | |||
| "pavgb (%2), %%xmm0 \n\t" | |||
| "pavgb (%2,%3), %%xmm1 \n\t" | |||
| "pavgb (%2,%3,2), %%xmm2 \n\t" | |||
| @@ -487,7 +488,6 @@ static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_si | |||
| "movdqa %%xmm2, (%2,%3,2) \n\t" | |||
| "movdqa %%xmm3, (%2,%4) \n\t" | |||
| "subl $4, %0 \n\t" | |||
| "lea (%1,%3,4), %1 \n\t" | |||
| "lea (%2,%3,4), %2 \n\t" | |||
| "jnz 1b \n\t" | |||
| : "+g"(h), "+r" (pixels), "+r" (block) | |||
| @@ -72,17 +72,17 @@ SECTION .text | |||
| .next4rows | |||
| movq mm0, [r1 ] | |||
| movq mm1, [r1+r2] | |||
| add r1, r4 | |||
| CHROMAMC_AVG mm0, [r0 ] | |||
| CHROMAMC_AVG mm1, [r0+r2] | |||
| movq [r0 ], mm0 | |||
| movq [r0+r2], mm1 | |||
| add r0, r4 | |||
| add r1, r4 | |||
| movq mm0, [r1 ] | |||
| movq mm1, [r1+r2] | |||
| add r1, r4 | |||
| CHROMAMC_AVG mm0, [r0 ] | |||
| CHROMAMC_AVG mm1, [r0+r2] | |||
| add r1, r4 | |||
| movq [r0 ], mm0 | |||
| movq [r0+r2], mm1 | |||
| add r0, r4 | |||
| @@ -472,8 +472,8 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||
| mov r6d, r4d | |||
| shl r4d, 8 | |||
| sub r4, r6 | |||
| add r4, 8 ; x*255+8 = x<<8 | (8-x) | |||
| mov r6, 8 | |||
| add r4, 8 ; x*255+8 = x<<8 | (8-x) | |||
| sub r6d, r5d | |||
| imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) | |||
| imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) | |||
| @@ -481,24 +481,23 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||
| movd m7, r6d | |||
| movd m6, r4d | |||
| movdqa m5, [rnd_2d_%2] | |||
| movq m0, [r1 ] | |||
| movq m1, [r1+1] | |||
| pshuflw m7, m7, 0 | |||
| pshuflw m6, m6, 0 | |||
| punpcklbw m0, m1 | |||
| movlhps m7, m7 | |||
| movlhps m6, m6 | |||
| movq m0, [r1 ] | |||
| movq m1, [r1 +1] | |||
| punpcklbw m0, m1 | |||
| add r1, r2 | |||
| .next2rows | |||
| movq m1, [r1 ] | |||
| movq m2, [r1 +1] | |||
| movq m3, [r1+r2 ] | |||
| movq m4, [r1+r2+1] | |||
| movq m1, [r1+r2*1 ] | |||
| movq m2, [r1+r2*1+1] | |||
| movq m3, [r1+r2*2 ] | |||
| movq m4, [r1+r2*2+1] | |||
| lea r1, [r1+r2*2] | |||
| punpcklbw m1, m2 | |||
| punpcklbw m3, m4 | |||
| movdqa m2, m1 | |||
| punpcklbw m3, m4 | |||
| movdqa m4, m3 | |||
| pmaddubsw m0, m7 | |||
| pmaddubsw m1, m6 | |||
| @@ -508,8 +507,8 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||
| paddw m2, m5 | |||
| paddw m1, m0 | |||
| paddw m3, m2 | |||
| movdqa m0, m4 | |||
| psrlw m1, 6 | |||
| movdqa m0, m4 | |||
| psrlw m3, 6 | |||
| %ifidn %1, avg | |||
| movq m2, [r0 ] | |||
| @@ -576,6 +575,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||
| movq m1, [r1+r2 ] | |||
| movdqa m2, m1 | |||
| movq m3, [r1+r2*2] | |||
| lea r1, [r1+r2*2] | |||
| punpcklbw m0, m1 | |||
| punpcklbw m2, m3 | |||
| pmaddubsw m0, m7 | |||
| @@ -594,7 +594,6 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||
| movhps [r0+r2], m0 | |||
| sub r3d, 2 | |||
| lea r0, [r0+r2*2] | |||
| lea r1, [r1+r2*2] | |||
| jg .next2yrows | |||
| REP_RET | |||
| %endmacro | |||
| @@ -607,8 +606,8 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0 | |||
| mov r6, r4 | |||
| shl r4d, 8 | |||
| sub r4d, r6d | |||
| add r4d, 8 ; x*255+8 | |||
| mov r6, 8 | |||
| add r4d, 8 ; x*255+8 | |||
| sub r6d, r5d | |||
| imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) | |||
| imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) | |||
| @@ -616,17 +615,16 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0 | |||
| movd m7, r6d | |||
| movd m6, r4d | |||
| movq m5, [pw_32] | |||
| movd m0, [r1 ] | |||
| pshufw m7, m7, 0 | |||
| punpcklbw m0, [r1+1] | |||
| pshufw m6, m6, 0 | |||
| movd m0, [r1 ] | |||
| punpcklbw m0, [r1 +1] | |||
| add r1, r2 | |||
| .next2rows | |||
| movd m1, [r1 ] | |||
| movd m3, [r1+r2 ] | |||
| punpcklbw m1, [r1 +1] | |||
| punpcklbw m3, [r1+r2+1] | |||
| movd m1, [r1+r2*1 ] | |||
| movd m3, [r1+r2*2 ] | |||
| punpcklbw m1, [r1+r2*1+1] | |||
| punpcklbw m3, [r1+r2*2+1] | |||
| lea r1, [r1+r2*2] | |||
| movq m2, m1 | |||
| movq m4, m3 | |||
| @@ -638,8 +636,8 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0 | |||
| paddw m2, m5 | |||
| paddw m1, m0 | |||
| paddw m3, m2 | |||
| movq m0, m4 | |||
| psrlw m1, 6 | |||
| movq m0, m4 | |||
| psrlw m3, 6 | |||
| packuswb m1, m1 | |||
| packuswb m3, m3 | |||
| @@ -240,17 +240,17 @@ cextern pb_A1 | |||
| ; out: m1=p0' m2=q0' | |||
| ; clobbers: m0,3-6 | |||
| %macro DEBLOCK_P0_Q0 0 | |||
| pxor m5, m1, m2 ; p0^q0 | |||
| pand m5, [pb_1] ; (p0^q0)&1 | |||
| pcmpeqb m4, m4 | |||
| pxor m5, m1, m2 ; p0^q0 | |||
| pxor m3, m4 | |||
| pand m5, [pb_1] ; (p0^q0)&1 | |||
| pavgb m3, m0 ; (p1 - q1 + 256)>>1 | |||
| pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 | |||
| pxor m4, m1 | |||
| pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 | |||
| pavgb m4, m2 ; (q0 - p0 + 256)>>1 | |||
| pavgb m3, m5 | |||
| paddusb m3, m4 ; d+128+33 | |||
| mova m6, [pb_A1] | |||
| paddusb m3, m4 ; d+128+33 | |||
| psubusb m6, m3 | |||
| psubusb m3, [pb_A1] | |||
| pminub m6, m7 | |||
| @@ -411,16 +411,16 @@ cglobal deblock_%2_luma_8_%1, 5,5 | |||
| LOAD_MASK r2, r3 | |||
| mov r3, r4mp | |||
| pcmpeqb m3, m3 | |||
| movd m4, [r3] ; tc0 | |||
| punpcklbw m4, m4 | |||
| punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] | |||
| mova [esp+%3], m4 ; tc | |||
| pcmpeqb m3, m3 | |||
| pcmpgtb m4, m3 | |||
| mova m3, [r4] ; p2 | |||
| pand m4, m7 | |||
| mova [esp], m4 ; mask | |||
| mova m3, [r4] ; p2 | |||
| DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 | |||
| pand m6, m4 | |||
| pand m4, [esp+%3] ; tc | |||
| @@ -430,11 +430,10 @@ cglobal deblock_%2_luma_8_%1, 5,5 | |||
| mova m4, [r0+2*r1] ; q2 | |||
| DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 | |||
| mova m5, [esp] ; mask | |||
| pand m6, m5 | |||
| pand m6, [esp] ; mask | |||
| mova m5, [esp+%3] ; tc | |||
| pand m5, m6 | |||
| psubb m7, m6 | |||
| pand m5, m6 | |||
| mova m3, [r0+r1] | |||
| LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 | |||
| @@ -482,10 +481,10 @@ cglobal deblock_h_luma_8_%1, 0,5 | |||
| ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) | |||
| mov r0, r0mp | |||
| sub r0, 2 | |||
| lea r1, [r0+r4] | |||
| movq m0, [pix_tmp+0x10] | |||
| movq m1, [pix_tmp+0x20] | |||
| lea r1, [r0+r4] | |||
| movq m2, [pix_tmp+0x30] | |||
| movq m3, [pix_tmp+0x40] | |||
| TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) | |||
| @@ -82,10 +82,10 @@ cglobal h264_idct_add_8_mmx, 3, 3, 0 | |||
| RET | |||
| %macro IDCT8_1D 2 | |||
| mova m4, m5 | |||
| mova m0, m1 | |||
| psraw m4, 1 | |||
| psraw m1, 1 | |||
| mova m4, m5 | |||
| psraw m4, 1 | |||
| paddw m4, m5 | |||
| paddw m1, m0 | |||
| paddw m4, m7 | |||
| @@ -95,16 +95,16 @@ cglobal h264_idct_add_8_mmx, 3, 3, 0 | |||
| psubw m0, m3 | |||
| psubw m5, m3 | |||
| psraw m3, 1 | |||
| paddw m0, m7 | |||
| psubw m5, m7 | |||
| psraw m3, 1 | |||
| psraw m7, 1 | |||
| psubw m0, m3 | |||
| psubw m5, m7 | |||
| mova m3, m4 | |||
| mova m7, m1 | |||
| psraw m1, 2 | |||
| mova m3, m4 | |||
| psraw m3, 2 | |||
| paddw m3, m0 | |||
| psraw m0, 2 | |||
| @@ -113,12 +113,12 @@ cglobal h264_idct_add_8_mmx, 3, 3, 0 | |||
| psubw m0, m4 | |||
| psubw m7, m5 | |||
| mova m4, m2 | |||
| mova m5, m6 | |||
| psraw m4, 1 | |||
| psraw m6, 1 | |||
| psubw m4, m5 | |||
| mova m4, m2 | |||
| psraw m4, 1 | |||
| paddw m6, m2 | |||
| psubw m4, m5 | |||
| mova m2, %1 | |||
| mova m5, %2 | |||
| @@ -337,7 +337,7 @@ cglobal h264_idct8_add4_8_mmx, 5, 7, 0 | |||
| test r6, r6 | |||
| jz .skipblock | |||
| mov r6d, dword [r1+r5*4] | |||
| lea r6, [r0+r6] | |||
| add r6, r0 | |||
| add word [r2], 32 | |||
| IDCT8_ADD_MMX_START r2 , rsp | |||
| IDCT8_ADD_MMX_START r2+8, rsp+64 | |||
| @@ -391,7 +391,7 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0 | |||
| REP_RET | |||
| .no_dc | |||
| mov r6d, dword [r1+r5*4] | |||
| lea r6, [r0+r6] | |||
| add r6, r0 | |||
| IDCT4_ADD r6, r2, r3 | |||
| .skipblock | |||
| inc r5 | |||
| @@ -414,7 +414,7 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7, 0 | |||
| test r6, r6 | |||
| jz .skipblock | |||
| mov r6d, dword [r1+r5*4] | |||
| lea r6, [r0+r6] | |||
| add r6, r0 | |||
| IDCT4_ADD r6, r2, r3 | |||
| .skipblock | |||
| inc r5 | |||
| @@ -456,7 +456,7 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0 | |||
| %define dst_regd r1d | |||
| %endif | |||
| mov dst_regd, dword [r1+r5*4] | |||
| lea dst_reg, [r0+dst_reg] | |||
| add dst_reg, r0 | |||
| DC_ADD_MMX2_OP movh, dst_reg, r3, r6 | |||
| %ifndef ARCH_X86_64 | |||
| mov r1, r1m | |||
| @@ -513,7 +513,7 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0 | |||
| RET | |||
| .no_dc | |||
| mov r6d, dword [r1+r5*4] | |||
| lea r6, [r0+r6] | |||
| add r6, r0 | |||
| add word [r2], 32 | |||
| IDCT8_ADD_MMX_START r2 , rsp | |||
| IDCT8_ADD_MMX_START r2+8, rsp+64 | |||
| @@ -558,7 +558,7 @@ INIT_MMX | |||
| %define dst_regd r1d | |||
| %endif | |||
| mov dst_regd, dword [r1+r5*4] | |||
| lea dst_reg, [r0+dst_reg] | |||
| add dst_reg, r0 | |||
| DC_ADD_MMX2_OP mova, dst_reg, r3, r6 | |||
| lea dst_reg, [dst_reg+r3*4] | |||
| DC_ADD_MMX2_OP mova, dst_reg, r3, r6 | |||
| @@ -573,7 +573,7 @@ INIT_MMX | |||
| .no_dc | |||
| INIT_XMM | |||
| mov dst_regd, dword [r1+r5*4] | |||
| lea dst_reg, [r0+dst_reg] | |||
| add dst_reg, r0 | |||
| IDCT8_ADD_SSE dst_reg, r2, r3, r6 | |||
| %ifndef ARCH_X86_64 | |||
| mov r1, r1m | |||
| @@ -497,10 +497,10 @@ | |||
| %macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride | |||
| movh %3, [%7] | |||
| movh %4, [%7+%8] | |||
| punpcklbw %3, %5 | |||
| punpcklbw %4, %5 | |||
| psraw %1, %6 | |||
| psraw %2, %6 | |||
| punpcklbw %3, %5 | |||
| punpcklbw %4, %5 | |||
| paddw %3, %1 | |||
| paddw %4, %2 | |||
| packuswb %3, %5 | |||