| @@ -456,12 +456,12 @@ static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_si | |||||
| "movdqu (%1,%3), %%xmm1 \n\t" | "movdqu (%1,%3), %%xmm1 \n\t" | ||||
| "movdqu (%1,%3,2), %%xmm2 \n\t" | "movdqu (%1,%3,2), %%xmm2 \n\t" | ||||
| "movdqu (%1,%4), %%xmm3 \n\t" | "movdqu (%1,%4), %%xmm3 \n\t" | ||||
| "lea (%1,%3,4), %1 \n\t" | |||||
| "movdqa %%xmm0, (%2) \n\t" | "movdqa %%xmm0, (%2) \n\t" | ||||
| "movdqa %%xmm1, (%2,%3) \n\t" | "movdqa %%xmm1, (%2,%3) \n\t" | ||||
| "movdqa %%xmm2, (%2,%3,2) \n\t" | "movdqa %%xmm2, (%2,%3,2) \n\t" | ||||
| "movdqa %%xmm3, (%2,%4) \n\t" | "movdqa %%xmm3, (%2,%4) \n\t" | ||||
| "subl $4, %0 \n\t" | "subl $4, %0 \n\t" | ||||
| "lea (%1,%3,4), %1 \n\t" | |||||
| "lea (%2,%3,4), %2 \n\t" | "lea (%2,%3,4), %2 \n\t" | ||||
| "jnz 1b \n\t" | "jnz 1b \n\t" | ||||
| : "+g"(h), "+r" (pixels), "+r" (block) | : "+g"(h), "+r" (pixels), "+r" (block) | ||||
| @@ -478,6 +478,7 @@ static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_si | |||||
| "movdqu (%1,%3), %%xmm1 \n\t" | "movdqu (%1,%3), %%xmm1 \n\t" | ||||
| "movdqu (%1,%3,2), %%xmm2 \n\t" | "movdqu (%1,%3,2), %%xmm2 \n\t" | ||||
| "movdqu (%1,%4), %%xmm3 \n\t" | "movdqu (%1,%4), %%xmm3 \n\t" | ||||
| "lea (%1,%3,4), %1 \n\t" | |||||
| "pavgb (%2), %%xmm0 \n\t" | "pavgb (%2), %%xmm0 \n\t" | ||||
| "pavgb (%2,%3), %%xmm1 \n\t" | "pavgb (%2,%3), %%xmm1 \n\t" | ||||
| "pavgb (%2,%3,2), %%xmm2 \n\t" | "pavgb (%2,%3,2), %%xmm2 \n\t" | ||||
| @@ -487,7 +488,6 @@ static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_si | |||||
| "movdqa %%xmm2, (%2,%3,2) \n\t" | "movdqa %%xmm2, (%2,%3,2) \n\t" | ||||
| "movdqa %%xmm3, (%2,%4) \n\t" | "movdqa %%xmm3, (%2,%4) \n\t" | ||||
| "subl $4, %0 \n\t" | "subl $4, %0 \n\t" | ||||
| "lea (%1,%3,4), %1 \n\t" | |||||
| "lea (%2,%3,4), %2 \n\t" | "lea (%2,%3,4), %2 \n\t" | ||||
| "jnz 1b \n\t" | "jnz 1b \n\t" | ||||
| : "+g"(h), "+r" (pixels), "+r" (block) | : "+g"(h), "+r" (pixels), "+r" (block) | ||||
| @@ -72,17 +72,17 @@ SECTION .text | |||||
| .next4rows | .next4rows | ||||
| movq mm0, [r1 ] | movq mm0, [r1 ] | ||||
| movq mm1, [r1+r2] | movq mm1, [r1+r2] | ||||
| add r1, r4 | |||||
| CHROMAMC_AVG mm0, [r0 ] | CHROMAMC_AVG mm0, [r0 ] | ||||
| CHROMAMC_AVG mm1, [r0+r2] | CHROMAMC_AVG mm1, [r0+r2] | ||||
| movq [r0 ], mm0 | movq [r0 ], mm0 | ||||
| movq [r0+r2], mm1 | movq [r0+r2], mm1 | ||||
| add r0, r4 | add r0, r4 | ||||
| add r1, r4 | |||||
| movq mm0, [r1 ] | movq mm0, [r1 ] | ||||
| movq mm1, [r1+r2] | movq mm1, [r1+r2] | ||||
| add r1, r4 | |||||
| CHROMAMC_AVG mm0, [r0 ] | CHROMAMC_AVG mm0, [r0 ] | ||||
| CHROMAMC_AVG mm1, [r0+r2] | CHROMAMC_AVG mm1, [r0+r2] | ||||
| add r1, r4 | |||||
| movq [r0 ], mm0 | movq [r0 ], mm0 | ||||
| movq [r0+r2], mm1 | movq [r0+r2], mm1 | ||||
| add r0, r4 | add r0, r4 | ||||
| @@ -472,8 +472,8 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||||
| mov r6d, r4d | mov r6d, r4d | ||||
| shl r4d, 8 | shl r4d, 8 | ||||
| sub r4, r6 | sub r4, r6 | ||||
| add r4, 8 ; x*255+8 = x<<8 | (8-x) | |||||
| mov r6, 8 | mov r6, 8 | ||||
| add r4, 8 ; x*255+8 = x<<8 | (8-x) | |||||
| sub r6d, r5d | sub r6d, r5d | ||||
| imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) | imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) | ||||
| imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) | imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) | ||||
| @@ -481,24 +481,23 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||||
| movd m7, r6d | movd m7, r6d | ||||
| movd m6, r4d | movd m6, r4d | ||||
| movdqa m5, [rnd_2d_%2] | movdqa m5, [rnd_2d_%2] | ||||
| movq m0, [r1 ] | |||||
| movq m1, [r1+1] | |||||
| pshuflw m7, m7, 0 | pshuflw m7, m7, 0 | ||||
| pshuflw m6, m6, 0 | pshuflw m6, m6, 0 | ||||
| punpcklbw m0, m1 | |||||
| movlhps m7, m7 | movlhps m7, m7 | ||||
| movlhps m6, m6 | movlhps m6, m6 | ||||
| movq m0, [r1 ] | |||||
| movq m1, [r1 +1] | |||||
| punpcklbw m0, m1 | |||||
| add r1, r2 | |||||
| .next2rows | .next2rows | ||||
| movq m1, [r1 ] | |||||
| movq m2, [r1 +1] | |||||
| movq m3, [r1+r2 ] | |||||
| movq m4, [r1+r2+1] | |||||
| movq m1, [r1+r2*1 ] | |||||
| movq m2, [r1+r2*1+1] | |||||
| movq m3, [r1+r2*2 ] | |||||
| movq m4, [r1+r2*2+1] | |||||
| lea r1, [r1+r2*2] | lea r1, [r1+r2*2] | ||||
| punpcklbw m1, m2 | punpcklbw m1, m2 | ||||
| punpcklbw m3, m4 | |||||
| movdqa m2, m1 | movdqa m2, m1 | ||||
| punpcklbw m3, m4 | |||||
| movdqa m4, m3 | movdqa m4, m3 | ||||
| pmaddubsw m0, m7 | pmaddubsw m0, m7 | ||||
| pmaddubsw m1, m6 | pmaddubsw m1, m6 | ||||
| @@ -508,8 +507,8 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||||
| paddw m2, m5 | paddw m2, m5 | ||||
| paddw m1, m0 | paddw m1, m0 | ||||
| paddw m3, m2 | paddw m3, m2 | ||||
| movdqa m0, m4 | |||||
| psrlw m1, 6 | psrlw m1, 6 | ||||
| movdqa m0, m4 | |||||
| psrlw m3, 6 | psrlw m3, 6 | ||||
| %ifidn %1, avg | %ifidn %1, avg | ||||
| movq m2, [r0 ] | movq m2, [r0 ] | ||||
| @@ -576,6 +575,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||||
| movq m1, [r1+r2 ] | movq m1, [r1+r2 ] | ||||
| movdqa m2, m1 | movdqa m2, m1 | ||||
| movq m3, [r1+r2*2] | movq m3, [r1+r2*2] | ||||
| lea r1, [r1+r2*2] | |||||
| punpcklbw m0, m1 | punpcklbw m0, m1 | ||||
| punpcklbw m2, m3 | punpcklbw m2, m3 | ||||
| pmaddubsw m0, m7 | pmaddubsw m0, m7 | ||||
| @@ -594,7 +594,6 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||||
| movhps [r0+r2], m0 | movhps [r0+r2], m0 | ||||
| sub r3d, 2 | sub r3d, 2 | ||||
| lea r0, [r0+r2*2] | lea r0, [r0+r2*2] | ||||
| lea r1, [r1+r2*2] | |||||
| jg .next2yrows | jg .next2yrows | ||||
| REP_RET | REP_RET | ||||
| %endmacro | %endmacro | ||||
| @@ -607,8 +606,8 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0 | |||||
| mov r6, r4 | mov r6, r4 | ||||
| shl r4d, 8 | shl r4d, 8 | ||||
| sub r4d, r6d | sub r4d, r6d | ||||
| add r4d, 8 ; x*255+8 | |||||
| mov r6, 8 | mov r6, 8 | ||||
| add r4d, 8 ; x*255+8 | |||||
| sub r6d, r5d | sub r6d, r5d | ||||
| imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) | imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) | ||||
| imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) | imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) | ||||
| @@ -616,17 +615,16 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0 | |||||
| movd m7, r6d | movd m7, r6d | ||||
| movd m6, r4d | movd m6, r4d | ||||
| movq m5, [pw_32] | movq m5, [pw_32] | ||||
| movd m0, [r1 ] | |||||
| pshufw m7, m7, 0 | pshufw m7, m7, 0 | ||||
| punpcklbw m0, [r1+1] | |||||
| pshufw m6, m6, 0 | pshufw m6, m6, 0 | ||||
| movd m0, [r1 ] | |||||
| punpcklbw m0, [r1 +1] | |||||
| add r1, r2 | |||||
| .next2rows | .next2rows | ||||
| movd m1, [r1 ] | |||||
| movd m3, [r1+r2 ] | |||||
| punpcklbw m1, [r1 +1] | |||||
| punpcklbw m3, [r1+r2+1] | |||||
| movd m1, [r1+r2*1 ] | |||||
| movd m3, [r1+r2*2 ] | |||||
| punpcklbw m1, [r1+r2*1+1] | |||||
| punpcklbw m3, [r1+r2*2+1] | |||||
| lea r1, [r1+r2*2] | lea r1, [r1+r2*2] | ||||
| movq m2, m1 | movq m2, m1 | ||||
| movq m4, m3 | movq m4, m3 | ||||
| @@ -638,8 +636,8 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0 | |||||
| paddw m2, m5 | paddw m2, m5 | ||||
| paddw m1, m0 | paddw m1, m0 | ||||
| paddw m3, m2 | paddw m3, m2 | ||||
| movq m0, m4 | |||||
| psrlw m1, 6 | psrlw m1, 6 | ||||
| movq m0, m4 | |||||
| psrlw m3, 6 | psrlw m3, 6 | ||||
| packuswb m1, m1 | packuswb m1, m1 | ||||
| packuswb m3, m3 | packuswb m3, m3 | ||||
| @@ -240,17 +240,17 @@ cextern pb_A1 | |||||
| ; out: m1=p0' m2=q0' | ; out: m1=p0' m2=q0' | ||||
| ; clobbers: m0,3-6 | ; clobbers: m0,3-6 | ||||
| %macro DEBLOCK_P0_Q0 0 | %macro DEBLOCK_P0_Q0 0 | ||||
| pxor m5, m1, m2 ; p0^q0 | |||||
| pand m5, [pb_1] ; (p0^q0)&1 | |||||
| pcmpeqb m4, m4 | pcmpeqb m4, m4 | ||||
| pxor m5, m1, m2 ; p0^q0 | |||||
| pxor m3, m4 | pxor m3, m4 | ||||
| pand m5, [pb_1] ; (p0^q0)&1 | |||||
| pavgb m3, m0 ; (p1 - q1 + 256)>>1 | pavgb m3, m0 ; (p1 - q1 + 256)>>1 | ||||
| pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 | |||||
| pxor m4, m1 | pxor m4, m1 | ||||
| pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 | |||||
| pavgb m4, m2 ; (q0 - p0 + 256)>>1 | pavgb m4, m2 ; (q0 - p0 + 256)>>1 | ||||
| pavgb m3, m5 | pavgb m3, m5 | ||||
| paddusb m3, m4 ; d+128+33 | |||||
| mova m6, [pb_A1] | mova m6, [pb_A1] | ||||
| paddusb m3, m4 ; d+128+33 | |||||
| psubusb m6, m3 | psubusb m6, m3 | ||||
| psubusb m3, [pb_A1] | psubusb m3, [pb_A1] | ||||
| pminub m6, m7 | pminub m6, m7 | ||||
| @@ -411,16 +411,16 @@ cglobal deblock_%2_luma_8_%1, 5,5 | |||||
| LOAD_MASK r2, r3 | LOAD_MASK r2, r3 | ||||
| mov r3, r4mp | mov r3, r4mp | ||||
| pcmpeqb m3, m3 | |||||
| movd m4, [r3] ; tc0 | movd m4, [r3] ; tc0 | ||||
| punpcklbw m4, m4 | punpcklbw m4, m4 | ||||
| punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] | punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] | ||||
| mova [esp+%3], m4 ; tc | mova [esp+%3], m4 ; tc | ||||
| pcmpeqb m3, m3 | |||||
| pcmpgtb m4, m3 | pcmpgtb m4, m3 | ||||
| mova m3, [r4] ; p2 | |||||
| pand m4, m7 | pand m4, m7 | ||||
| mova [esp], m4 ; mask | mova [esp], m4 ; mask | ||||
| mova m3, [r4] ; p2 | |||||
| DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 | DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 | ||||
| pand m6, m4 | pand m6, m4 | ||||
| pand m4, [esp+%3] ; tc | pand m4, [esp+%3] ; tc | ||||
| @@ -430,11 +430,10 @@ cglobal deblock_%2_luma_8_%1, 5,5 | |||||
| mova m4, [r0+2*r1] ; q2 | mova m4, [r0+2*r1] ; q2 | ||||
| DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 | DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 | ||||
| mova m5, [esp] ; mask | |||||
| pand m6, m5 | |||||
| pand m6, [esp] ; mask | |||||
| mova m5, [esp+%3] ; tc | mova m5, [esp+%3] ; tc | ||||
| pand m5, m6 | |||||
| psubb m7, m6 | psubb m7, m6 | ||||
| pand m5, m6 | |||||
| mova m3, [r0+r1] | mova m3, [r0+r1] | ||||
| LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 | LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 | ||||
| @@ -482,10 +481,10 @@ cglobal deblock_h_luma_8_%1, 0,5 | |||||
| ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) | ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) | ||||
| mov r0, r0mp | mov r0, r0mp | ||||
| sub r0, 2 | sub r0, 2 | ||||
| lea r1, [r0+r4] | |||||
| movq m0, [pix_tmp+0x10] | movq m0, [pix_tmp+0x10] | ||||
| movq m1, [pix_tmp+0x20] | movq m1, [pix_tmp+0x20] | ||||
| lea r1, [r0+r4] | |||||
| movq m2, [pix_tmp+0x30] | movq m2, [pix_tmp+0x30] | ||||
| movq m3, [pix_tmp+0x40] | movq m3, [pix_tmp+0x40] | ||||
| TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) | TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) | ||||
| @@ -82,10 +82,10 @@ cglobal h264_idct_add_8_mmx, 3, 3, 0 | |||||
| RET | RET | ||||
| %macro IDCT8_1D 2 | %macro IDCT8_1D 2 | ||||
| mova m4, m5 | |||||
| mova m0, m1 | mova m0, m1 | ||||
| psraw m4, 1 | |||||
| psraw m1, 1 | psraw m1, 1 | ||||
| mova m4, m5 | |||||
| psraw m4, 1 | |||||
| paddw m4, m5 | paddw m4, m5 | ||||
| paddw m1, m0 | paddw m1, m0 | ||||
| paddw m4, m7 | paddw m4, m7 | ||||
| @@ -95,16 +95,16 @@ cglobal h264_idct_add_8_mmx, 3, 3, 0 | |||||
| psubw m0, m3 | psubw m0, m3 | ||||
| psubw m5, m3 | psubw m5, m3 | ||||
| psraw m3, 1 | |||||
| paddw m0, m7 | paddw m0, m7 | ||||
| psubw m5, m7 | psubw m5, m7 | ||||
| psraw m3, 1 | |||||
| psraw m7, 1 | psraw m7, 1 | ||||
| psubw m0, m3 | psubw m0, m3 | ||||
| psubw m5, m7 | psubw m5, m7 | ||||
| mova m3, m4 | |||||
| mova m7, m1 | mova m7, m1 | ||||
| psraw m1, 2 | psraw m1, 2 | ||||
| mova m3, m4 | |||||
| psraw m3, 2 | psraw m3, 2 | ||||
| paddw m3, m0 | paddw m3, m0 | ||||
| psraw m0, 2 | psraw m0, 2 | ||||
| @@ -113,12 +113,12 @@ cglobal h264_idct_add_8_mmx, 3, 3, 0 | |||||
| psubw m0, m4 | psubw m0, m4 | ||||
| psubw m7, m5 | psubw m7, m5 | ||||
| mova m4, m2 | |||||
| mova m5, m6 | mova m5, m6 | ||||
| psraw m4, 1 | |||||
| psraw m6, 1 | psraw m6, 1 | ||||
| psubw m4, m5 | |||||
| mova m4, m2 | |||||
| psraw m4, 1 | |||||
| paddw m6, m2 | paddw m6, m2 | ||||
| psubw m4, m5 | |||||
| mova m2, %1 | mova m2, %1 | ||||
| mova m5, %2 | mova m5, %2 | ||||
| @@ -337,7 +337,7 @@ cglobal h264_idct8_add4_8_mmx, 5, 7, 0 | |||||
| test r6, r6 | test r6, r6 | ||||
| jz .skipblock | jz .skipblock | ||||
| mov r6d, dword [r1+r5*4] | mov r6d, dword [r1+r5*4] | ||||
| lea r6, [r0+r6] | |||||
| add r6, r0 | |||||
| add word [r2], 32 | add word [r2], 32 | ||||
| IDCT8_ADD_MMX_START r2 , rsp | IDCT8_ADD_MMX_START r2 , rsp | ||||
| IDCT8_ADD_MMX_START r2+8, rsp+64 | IDCT8_ADD_MMX_START r2+8, rsp+64 | ||||
| @@ -391,7 +391,7 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0 | |||||
| REP_RET | REP_RET | ||||
| .no_dc | .no_dc | ||||
| mov r6d, dword [r1+r5*4] | mov r6d, dword [r1+r5*4] | ||||
| lea r6, [r0+r6] | |||||
| add r6, r0 | |||||
| IDCT4_ADD r6, r2, r3 | IDCT4_ADD r6, r2, r3 | ||||
| .skipblock | .skipblock | ||||
| inc r5 | inc r5 | ||||
| @@ -414,7 +414,7 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7, 0 | |||||
| test r6, r6 | test r6, r6 | ||||
| jz .skipblock | jz .skipblock | ||||
| mov r6d, dword [r1+r5*4] | mov r6d, dword [r1+r5*4] | ||||
| lea r6, [r0+r6] | |||||
| add r6, r0 | |||||
| IDCT4_ADD r6, r2, r3 | IDCT4_ADD r6, r2, r3 | ||||
| .skipblock | .skipblock | ||||
| inc r5 | inc r5 | ||||
| @@ -456,7 +456,7 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0 | |||||
| %define dst_regd r1d | %define dst_regd r1d | ||||
| %endif | %endif | ||||
| mov dst_regd, dword [r1+r5*4] | mov dst_regd, dword [r1+r5*4] | ||||
| lea dst_reg, [r0+dst_reg] | |||||
| add dst_reg, r0 | |||||
| DC_ADD_MMX2_OP movh, dst_reg, r3, r6 | DC_ADD_MMX2_OP movh, dst_reg, r3, r6 | ||||
| %ifndef ARCH_X86_64 | %ifndef ARCH_X86_64 | ||||
| mov r1, r1m | mov r1, r1m | ||||
| @@ -513,7 +513,7 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0 | |||||
| RET | RET | ||||
| .no_dc | .no_dc | ||||
| mov r6d, dword [r1+r5*4] | mov r6d, dword [r1+r5*4] | ||||
| lea r6, [r0+r6] | |||||
| add r6, r0 | |||||
| add word [r2], 32 | add word [r2], 32 | ||||
| IDCT8_ADD_MMX_START r2 , rsp | IDCT8_ADD_MMX_START r2 , rsp | ||||
| IDCT8_ADD_MMX_START r2+8, rsp+64 | IDCT8_ADD_MMX_START r2+8, rsp+64 | ||||
| @@ -558,7 +558,7 @@ INIT_MMX | |||||
| %define dst_regd r1d | %define dst_regd r1d | ||||
| %endif | %endif | ||||
| mov dst_regd, dword [r1+r5*4] | mov dst_regd, dword [r1+r5*4] | ||||
| lea dst_reg, [r0+dst_reg] | |||||
| add dst_reg, r0 | |||||
| DC_ADD_MMX2_OP mova, dst_reg, r3, r6 | DC_ADD_MMX2_OP mova, dst_reg, r3, r6 | ||||
| lea dst_reg, [dst_reg+r3*4] | lea dst_reg, [dst_reg+r3*4] | ||||
| DC_ADD_MMX2_OP mova, dst_reg, r3, r6 | DC_ADD_MMX2_OP mova, dst_reg, r3, r6 | ||||
| @@ -573,7 +573,7 @@ INIT_MMX | |||||
| .no_dc | .no_dc | ||||
| INIT_XMM | INIT_XMM | ||||
| mov dst_regd, dword [r1+r5*4] | mov dst_regd, dword [r1+r5*4] | ||||
| lea dst_reg, [r0+dst_reg] | |||||
| add dst_reg, r0 | |||||
| IDCT8_ADD_SSE dst_reg, r2, r3, r6 | IDCT8_ADD_SSE dst_reg, r2, r3, r6 | ||||
| %ifndef ARCH_X86_64 | %ifndef ARCH_X86_64 | ||||
| mov r1, r1m | mov r1, r1m | ||||
| @@ -497,10 +497,10 @@ | |||||
| %macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride | %macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride | ||||
| movh %3, [%7] | movh %3, [%7] | ||||
| movh %4, [%7+%8] | movh %4, [%7+%8] | ||||
| punpcklbw %3, %5 | |||||
| punpcklbw %4, %5 | |||||
| psraw %1, %6 | psraw %1, %6 | ||||
| psraw %2, %6 | psraw %2, %6 | ||||
| punpcklbw %3, %5 | |||||
| punpcklbw %4, %5 | |||||
| paddw %3, %1 | paddw %3, %1 | ||||
| paddw %4, %2 | paddw %4, %2 | ||||
| packuswb %3, %5 | packuswb %3, %5 | ||||