nasm prints a warning if the colon is missing.

Signed-off-by: Mans Rullgard <mans@mansr.com>

tags/n1.0
| @@ -39,7 +39,7 @@ cglobal deinterlace_line_mmx, 7,7,7, dst, lum_m4, lum_m3, lum_m2, lum_m1 | |||||
| %endif | %endif | ||||
| pxor mm7, mm7 | pxor mm7, mm7 | ||||
| movq mm6, [pw_4] | movq mm6, [pw_4] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movd mm0, [lum_m4q] | movd mm0, [lum_m4q] | ||||
| movd mm1, [lum_m3q] | movd mm1, [lum_m3q] | ||||
| movd mm2, [lum_m2q] | movd mm2, [lum_m2q] | ||||
| @@ -1143,7 +1143,7 @@ VECTOR_CLIP_INT32 6, 1, 0, 0 | |||||
| cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len | cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len | ||||
| lea lenq, [lend*4 - 2*mmsize] | lea lenq, [lend*4 - 2*mmsize] | ||||
| ALIGN 16 | ALIGN 16 | ||||
| .loop | |||||
| .loop: | |||||
| %if cpuflag(avx) | %if cpuflag(avx) | ||||
| vmovaps xmm0, [src1q + 16] | vmovaps xmm0, [src1q + 16] | ||||
| vinsertf128 m0, m0, [src1q], 1 | vinsertf128 m0, m0, [src1q], 1 | ||||
| @@ -1182,7 +1182,7 @@ VECTOR_FMUL_REVERSE | |||||
| cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len | cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len | ||||
| lea lenq, [lend*4 - 2*mmsize] | lea lenq, [lend*4 - 2*mmsize] | ||||
| ALIGN 16 | ALIGN 16 | ||||
| .loop | |||||
| .loop: | |||||
| mova m0, [src0q + lenq] | mova m0, [src0q + lenq] | ||||
| mova m1, [src0q + lenq + mmsize] | mova m1, [src0q + lenq + mmsize] | ||||
| mulps m0, m0, [src1q + lenq] | mulps m0, m0, [src1q + lenq] | ||||
| @@ -1313,7 +1313,7 @@ cglobal bswap32_buf, 3,4,5 | |||||
| add r0, 4 | add r0, 4 | ||||
| dec r2 | dec r2 | ||||
| jnz .loop2 | jnz .loop2 | ||||
| .end | |||||
| .end: | |||||
| RET | RET | ||||
| ; %1 = aligned/unaligned | ; %1 = aligned/unaligned | ||||
| @@ -184,7 +184,7 @@ cglobal hadamard8_diff16_%1, 5, 6, %2 | |||||
| call hadamard8x8_diff_%1 | call hadamard8x8_diff_%1 | ||||
| add r5d, eax | add r5d, eax | ||||
| .done | |||||
| .done: | |||||
| mov eax, r5d | mov eax, r5d | ||||
| %ifndef m8 | %ifndef m8 | ||||
| ADD rsp, pad | ADD rsp, pad | ||||
| @@ -288,7 +288,7 @@ cglobal sse16_sse2, 5, 5, 8 | |||||
| pxor m0, m0 ; mm0 = 0 | pxor m0, m0 ; mm0 = 0 | ||||
| pxor m7, m7 ; mm7 holds the sum | pxor m7, m7 ; mm7 holds the sum | ||||
| .next2lines ; FIXME why are these unaligned movs? pix1[] is aligned | |||||
| .next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned | |||||
| movu m1, [r1 ] ; mm1 = pix1[0][0-15] | movu m1, [r1 ] ; mm1 = pix1[0][0-15] | ||||
| movu m2, [r2 ] ; mm2 = pix2[0][0-15] | movu m2, [r2 ] ; mm2 = pix2[0][0-15] | ||||
| movu m3, [r1+r3] ; mm3 = pix1[1][0-15] | movu m3, [r1+r3] ; mm3 = pix1[1][0-15] | ||||
| @@ -607,7 +607,7 @@ cglobal fft_calc, 2,5,8 | |||||
| add rcx, 3 | add rcx, 3 | ||||
| shl r2, cl | shl r2, cl | ||||
| sub r4, r2 | sub r4, r2 | ||||
| .loop | |||||
| .loop: | |||||
| %if mmsize == 8 | %if mmsize == 8 | ||||
| PSWAPD m0, [r4 + r2 + 4] | PSWAPD m0, [r4 + r2 + 4] | ||||
| mova [r4 + r2 + 4], m0 | mova [r4 + r2 + 4], m0 | ||||
| @@ -404,7 +404,7 @@ cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1 | |||||
| mov src1q, [srcq+gprsize] | mov src1q, [srcq+gprsize] | ||||
| mov srcq, [srcq ] | mov srcq, [srcq ] | ||||
| sub src1q, srcq | sub src1q, srcq | ||||
| .loop | |||||
| .loop: | |||||
| MOVPS m0, [srcq ] | MOVPS m0, [srcq ] | ||||
| MOVPS m1, [srcq+src1q ] | MOVPS m1, [srcq+src1q ] | ||||
| MOVPS m3, [srcq +mmsize] | MOVPS m3, [srcq +mmsize] | ||||
| @@ -69,7 +69,7 @@ SECTION .text | |||||
| %macro mv0_pixels_mc8 0 | %macro mv0_pixels_mc8 0 | ||||
| lea r4, [r2*2 ] | lea r4, [r2*2 ] | ||||
| .next4rows | |||||
| .next4rows: | |||||
| movq mm0, [r1 ] | movq mm0, [r1 ] | ||||
| movq mm1, [r1+r2] | movq mm1, [r1+r2] | ||||
| add r1, r4 | add r1, r4 | ||||
| @@ -117,7 +117,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0 | |||||
| mv0_pixels_mc8 | mv0_pixels_mc8 | ||||
| REP_RET | REP_RET | ||||
| .at_least_one_non_zero | |||||
| .at_least_one_non_zero: | |||||
| %ifidn %2, rv40 | %ifidn %2, rv40 | ||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r7, r5 | mov r7, r5 | ||||
| @@ -145,7 +145,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0 | |||||
| test r4d, r4d | test r4d, r4d | ||||
| mov r6, r2 ; dxy = x ? 1 : stride | mov r6, r2 ; dxy = x ? 1 : stride | ||||
| jne .both_non_zero | jne .both_non_zero | ||||
| .my_is_zero | |||||
| .my_is_zero: | |||||
| ; mx == 0 XOR my == 0 - 1 dimensional filter only | ; mx == 0 XOR my == 0 - 1 dimensional filter only | ||||
| or r4d, r5d ; x + y | or r4d, r5d ; x + y | ||||
| @@ -166,7 +166,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0 | |||||
| pxor m7, m7 | pxor m7, m7 | ||||
| psubw m4, m5 ; mm4 = A = 8-x | psubw m4, m5 ; mm4 = A = 8-x | ||||
| .next1drow | |||||
| .next1drow: | |||||
| movq m0, [r1 ] ; mm0 = src[0..7] | movq m0, [r1 ] ; mm0 = src[0..7] | ||||
| movq m2, [r1+r6] ; mm1 = src[1..8] | movq m2, [r1+r6] ; mm1 = src[1..8] | ||||
| @@ -197,7 +197,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0 | |||||
| jne .next1drow | jne .next1drow | ||||
| REP_RET | REP_RET | ||||
| .both_non_zero ; general case, bilinear | |||||
| .both_non_zero: ; general case, bilinear | |||||
| movd m4, r4d ; x | movd m4, r4d ; x | ||||
| movd m6, r5d ; y | movd m6, r5d ; y | ||||
| %ifidn %2, rv40 | %ifidn %2, rv40 | ||||
| @@ -232,7 +232,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0 | |||||
| movq m0, [r1 ] ; mm0 = src[0..7] | movq m0, [r1 ] ; mm0 = src[0..7] | ||||
| movq m1, [r1+1] ; mm1 = src[1..8] | movq m1, [r1+1] ; mm1 = src[1..8] | ||||
| .next2drow | |||||
| .next2drow: | |||||
| add r1, r2 | add r1, r2 | ||||
| movq m2, m0 | movq m2, m0 | ||||
| @@ -330,7 +330,7 @@ cglobal %1_%2_chroma_mc4_%3, 6, 6 + extra_regs, 0 | |||||
| pmullw m6, m2 | pmullw m6, m2 | ||||
| paddw m6, m0 | paddw m6, m0 | ||||
| .next2rows | |||||
| .next2rows: | |||||
| movd m0, [r1 ] | movd m0, [r1 ] | ||||
| movd m1, [r1+1] | movd m1, [r1+1] | ||||
| add r1, r2 | add r1, r2 | ||||
| @@ -397,7 +397,7 @@ cglobal %1_%2_chroma_mc2_%3, 6, 7, 0 | |||||
| punpcklbw m2, m7 | punpcklbw m2, m7 | ||||
| pshufw m2, m2, 0x94 ; mm0 = src[0,1,1,2] | pshufw m2, m2, 0x94 ; mm0 = src[0,1,1,2] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| add r1, r2 | add r1, r2 | ||||
| movq m1, m2 | movq m1, m2 | ||||
| pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2] | pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2] | ||||
| @@ -474,7 +474,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||||
| mv0_pixels_mc8 | mv0_pixels_mc8 | ||||
| REP_RET | REP_RET | ||||
| .at_least_one_non_zero | |||||
| .at_least_one_non_zero: | |||||
| test r5d, r5d | test r5d, r5d | ||||
| je .my_is_zero | je .my_is_zero | ||||
| test r4d, r4d | test r4d, r4d | ||||
| @@ -501,7 +501,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||||
| movlhps m7, m7 | movlhps m7, m7 | ||||
| movlhps m6, m6 | movlhps m6, m6 | ||||
| .next2rows | |||||
| .next2rows: | |||||
| movq m1, [r1+r2*1 ] | movq m1, [r1+r2*1 ] | ||||
| movq m2, [r1+r2*1+1] | movq m2, [r1+r2*1+1] | ||||
| movq m3, [r1+r2*2 ] | movq m3, [r1+r2*2 ] | ||||
| @@ -535,7 +535,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||||
| jg .next2rows | jg .next2rows | ||||
| REP_RET | REP_RET | ||||
| .my_is_zero | |||||
| .my_is_zero: | |||||
| mov r5d, r4d | mov r5d, r4d | ||||
| shl r4d, 8 | shl r4d, 8 | ||||
| add r4, 8 | add r4, 8 | ||||
| @@ -545,7 +545,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||||
| pshuflw m7, m7, 0 | pshuflw m7, m7, 0 | ||||
| movlhps m7, m7 | movlhps m7, m7 | ||||
| .next2xrows | |||||
| .next2xrows: | |||||
| movq m0, [r1 ] | movq m0, [r1 ] | ||||
| movq m1, [r1 +1] | movq m1, [r1 +1] | ||||
| movq m2, [r1+r2 ] | movq m2, [r1+r2 ] | ||||
| @@ -572,7 +572,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||||
| jg .next2xrows | jg .next2xrows | ||||
| REP_RET | REP_RET | ||||
| .mx_is_zero | |||||
| .mx_is_zero: | |||||
| mov r4d, r5d | mov r4d, r5d | ||||
| shl r5d, 8 | shl r5d, 8 | ||||
| add r5, 8 | add r5, 8 | ||||
| @@ -582,7 +582,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 | |||||
| pshuflw m7, m7, 0 | pshuflw m7, m7, 0 | ||||
| movlhps m7, m7 | movlhps m7, m7 | ||||
| .next2yrows | |||||
| .next2yrows: | |||||
| movq m0, [r1 ] | movq m0, [r1 ] | ||||
| movq m1, [r1+r2 ] | movq m1, [r1+r2 ] | ||||
| movdqa m2, m1 | movdqa m2, m1 | ||||
| @@ -632,7 +632,7 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0 | |||||
| punpcklbw m0, [r1+1] | punpcklbw m0, [r1+1] | ||||
| pshufw m6, m6, 0 | pshufw m6, m6, 0 | ||||
| .next2rows | |||||
| .next2rows: | |||||
| movd m1, [r1+r2*1 ] | movd m1, [r1+r2*1 ] | ||||
| movd m3, [r1+r2*2 ] | movd m3, [r1+r2*2 ] | ||||
| punpcklbw m1, [r1+r2*1+1] | punpcklbw m1, [r1+r2*1+1] | ||||
| @@ -38,7 +38,7 @@ SECTION .text | |||||
| %macro MV0_PIXELS_MC8 0 | %macro MV0_PIXELS_MC8 0 | ||||
| lea r4, [r2*3 ] | lea r4, [r2*3 ] | ||||
| lea r5, [r2*4 ] | lea r5, [r2*4 ] | ||||
| .next4rows | |||||
| .next4rows: | |||||
| movu m0, [r1 ] | movu m0, [r1 ] | ||||
| movu m1, [r1+r2 ] | movu m1, [r1+r2 ] | ||||
| CHROMAMC_AVG m0, [r0 ] | CHROMAMC_AVG m0, [r0 ] | ||||
| @@ -72,14 +72,14 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8 | |||||
| MV0_PIXELS_MC8 | MV0_PIXELS_MC8 | ||||
| REP_RET | REP_RET | ||||
| .at_least_one_non_zero | |||||
| .at_least_one_non_zero: | |||||
| mov r6d, 2 | mov r6d, 2 | ||||
| test r5d, r5d | test r5d, r5d | ||||
| je .x_interpolation | je .x_interpolation | ||||
| mov r6, r2 ; dxy = x ? 1 : stride | mov r6, r2 ; dxy = x ? 1 : stride | ||||
| test r4d, r4d | test r4d, r4d | ||||
| jne .xy_interpolation | jne .xy_interpolation | ||||
| .x_interpolation | |||||
| .x_interpolation: | |||||
| ; mx == 0 XOR my == 0 - 1 dimensional filter only | ; mx == 0 XOR my == 0 - 1 dimensional filter only | ||||
| or r4d, r5d ; x + y | or r4d, r5d ; x + y | ||||
| movd m5, r4d | movd m5, r4d | ||||
| @@ -88,7 +88,7 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8 | |||||
| SPLATW m5, m5 ; mm5 = B = x | SPLATW m5, m5 ; mm5 = B = x | ||||
| psubw m4, m5 ; mm4 = A = 8-x | psubw m4, m5 ; mm4 = A = 8-x | ||||
| .next1drow | |||||
| .next1drow: | |||||
| movu m0, [r1 ] ; mm0 = src[0..7] | movu m0, [r1 ] ; mm0 = src[0..7] | ||||
| movu m2, [r1+r6] ; mm2 = src[1..8] | movu m2, [r1+r6] ; mm2 = src[1..8] | ||||
| @@ -107,7 +107,7 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8 | |||||
| jne .next1drow | jne .next1drow | ||||
| REP_RET | REP_RET | ||||
| .xy_interpolation ; general case, bilinear | |||||
| .xy_interpolation: ; general case, bilinear | |||||
| movd m4, r4m ; x | movd m4, r4m ; x | ||||
| movd m6, r5m ; y | movd m6, r5m ; y | ||||
| @@ -125,7 +125,7 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8 | |||||
| movu m0, [r1 ] ; mm0 = src[0..7] | movu m0, [r1 ] ; mm0 = src[0..7] | ||||
| movu m1, [r1+2] ; mm1 = src[1..8] | movu m1, [r1+2] ; mm1 = src[1..8] | ||||
| .next2drow | |||||
| .next2drow: | |||||
| add r1, r2 | add r1, r2 | ||||
| pmullw m2, m0, m4 | pmullw m2, m0, m4 | ||||
| @@ -192,7 +192,7 @@ cglobal %1_h264_chroma_mc4_10, 6,6,7 | |||||
| pmullw m6, m2 | pmullw m6, m2 | ||||
| paddw m6, m0 | paddw m6, m0 | ||||
| .next2rows | |||||
| .next2rows: | |||||
| MC4_OP m0, m6 | MC4_OP m0, m6 | ||||
| MC4_OP m6, m0 | MC4_OP m6, m0 | ||||
| sub r3d, 2 | sub r3d, 2 | ||||
| @@ -221,7 +221,7 @@ cglobal %1_h264_chroma_mc2_10, 6,7 | |||||
| pxor m7, m7 | pxor m7, m7 | ||||
| pshufw m2, [r1], 0x94 ; mm0 = src[0,1,1,2] | pshufw m2, [r1], 0x94 ; mm0 = src[0,1,1,2] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| add r1, r2 | add r1, r2 | ||||
| movq m1, m2 | movq m1, m2 | ||||
| pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2] | pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2] | ||||
| @@ -623,7 +623,7 @@ cglobal deblock_v_luma_intra_10, 4,7,16 | |||||
| shl r2d, 2 | shl r2d, 2 | ||||
| shl r3d, 2 | shl r3d, 2 | ||||
| LOAD_AB aa, bb, r2d, r3d | LOAD_AB aa, bb, r2d, r3d | ||||
| .loop | |||||
| .loop: | |||||
| mova p2, [r4+r1] | mova p2, [r4+r1] | ||||
| mova p1, [r4+2*r1] | mova p1, [r4+2*r1] | ||||
| mova p0, [r4+r5] | mova p0, [r4+r5] | ||||
| @@ -674,7 +674,7 @@ cglobal deblock_h_luma_intra_10, 4,7,16 | |||||
| mova m0, [pw_2] | mova m0, [pw_2] | ||||
| shl r2d, 2 | shl r2d, 2 | ||||
| shl r3d, 2 | shl r3d, 2 | ||||
| .loop | |||||
| .loop: | |||||
| movu q3, [r0-8] | movu q3, [r0-8] | ||||
| movu q2, [r0+r1-8] | movu q2, [r0+r1-8] | ||||
| movu q1, [r0+r1*2-8] | movu q1, [r0+r1*2-8] | ||||
| @@ -308,7 +308,7 @@ cglobal h264_idct_add16_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, str | |||||
| %ifdef PIC | %ifdef PIC | ||||
| lea picregq, [scan8_mem] | lea picregq, [scan8_mem] | ||||
| %endif | %endif | ||||
| .nextblock | |||||
| .nextblock: | |||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| movzx r6, byte [r4+r6] | movzx r6, byte [r4+r6] | ||||
| test r6, r6 | test r6, r6 | ||||
| @@ -316,7 +316,7 @@ cglobal h264_idct_add16_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, str | |||||
| mov r6d, dword [r1+r5*4] | mov r6d, dword [r1+r5*4] | ||||
| lea r6, [r0+r6] | lea r6, [r0+r6] | ||||
| IDCT4_ADD r6, r2, r3 | IDCT4_ADD r6, r2, r3 | ||||
| .skipblock | |||||
| .skipblock: | |||||
| inc r5 | inc r5 | ||||
| add r2, 32 | add r2, 32 | ||||
| cmp r5, 16 | cmp r5, 16 | ||||
| @@ -333,7 +333,7 @@ cglobal h264_idct8_add4_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, str | |||||
| %ifdef PIC | %ifdef PIC | ||||
| lea picregq, [scan8_mem] | lea picregq, [scan8_mem] | ||||
| %endif | %endif | ||||
| .nextblock | |||||
| .nextblock: | |||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| movzx r6, byte [r4+r6] | movzx r6, byte [r4+r6] | ||||
| test r6, r6 | test r6, r6 | ||||
| @@ -347,7 +347,7 @@ cglobal h264_idct8_add4_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, str | |||||
| mov r6d, dword [r1+r5*4] | mov r6d, dword [r1+r5*4] | ||||
| lea r6, [r0+r6+4] | lea r6, [r0+r6+4] | ||||
| IDCT8_ADD_MMX_END r6 , rsp+8, r3 | IDCT8_ADD_MMX_END r6 , rsp+8, r3 | ||||
| .skipblock | |||||
| .skipblock: | |||||
| add r5, 4 | add r5, 4 | ||||
| add r2, 128 | add r2, 128 | ||||
| cmp r5, 16 | cmp r5, 16 | ||||
| @@ -362,7 +362,7 @@ cglobal h264_idct_add16_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s | |||||
| %ifdef PIC | %ifdef PIC | ||||
| lea picregq, [scan8_mem] | lea picregq, [scan8_mem] | ||||
| %endif | %endif | ||||
| .nextblock | |||||
| .nextblock: | |||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| movzx r6, byte [r4+r6] | movzx r6, byte [r4+r6] | ||||
| test r6, r6 | test r6, r6 | ||||
| @@ -388,11 +388,11 @@ cglobal h264_idct_add16_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s | |||||
| cmp r5, 16 | cmp r5, 16 | ||||
| jl .nextblock | jl .nextblock | ||||
| REP_RET | REP_RET | ||||
| .no_dc | |||||
| .no_dc: | |||||
| mov r6d, dword [r1+r5*4] | mov r6d, dword [r1+r5*4] | ||||
| add r6, r0 | add r6, r0 | ||||
| IDCT4_ADD r6, r2, r3 | IDCT4_ADD r6, r2, r3 | ||||
| .skipblock | |||||
| .skipblock: | |||||
| inc r5 | inc r5 | ||||
| add r2, 32 | add r2, 32 | ||||
| cmp r5, 16 | cmp r5, 16 | ||||
| @@ -406,7 +406,7 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block | |||||
| %ifdef PIC | %ifdef PIC | ||||
| lea picregq, [scan8_mem] | lea picregq, [scan8_mem] | ||||
| %endif | %endif | ||||
| .nextblock | |||||
| .nextblock: | |||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| movzx r6, byte [r4+r6] | movzx r6, byte [r4+r6] | ||||
| or r6w, word [r2] | or r6w, word [r2] | ||||
| @@ -415,7 +415,7 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block | |||||
| mov r6d, dword [r1+r5*4] | mov r6d, dword [r1+r5*4] | ||||
| add r6, r0 | add r6, r0 | ||||
| IDCT4_ADD r6, r2, r3 | IDCT4_ADD r6, r2, r3 | ||||
| .skipblock | |||||
| .skipblock: | |||||
| inc r5 | inc r5 | ||||
| add r2, 32 | add r2, 32 | ||||
| cmp r5, 16 | cmp r5, 16 | ||||
| @@ -429,7 +429,7 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, blo | |||||
| %ifdef PIC | %ifdef PIC | ||||
| lea picregq, [scan8_mem] | lea picregq, [scan8_mem] | ||||
| %endif | %endif | ||||
| .nextblock | |||||
| .nextblock: | |||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| movzx r6, byte [r4+r6] | movzx r6, byte [r4+r6] | ||||
| test r6, r6 | test r6, r6 | ||||
| @@ -442,7 +442,7 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, blo | |||||
| cmp r5, 16 | cmp r5, 16 | ||||
| jl .nextblock | jl .nextblock | ||||
| REP_RET | REP_RET | ||||
| .try_dc | |||||
| .try_dc: | |||||
| movsx r6, word [r2] | movsx r6, word [r2] | ||||
| test r6, r6 | test r6, r6 | ||||
| jz .skipblock | jz .skipblock | ||||
| @@ -457,7 +457,7 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, blo | |||||
| %if ARCH_X86_64 == 0 | %if ARCH_X86_64 == 0 | ||||
| mov r1, r1m | mov r1, r1m | ||||
| %endif | %endif | ||||
| .skipblock | |||||
| .skipblock: | |||||
| inc r5 | inc r5 | ||||
| add r2, 32 | add r2, 32 | ||||
| cmp r5, 16 | cmp r5, 16 | ||||
| @@ -474,7 +474,7 @@ cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s | |||||
| %ifdef PIC | %ifdef PIC | ||||
| lea picregq, [scan8_mem] | lea picregq, [scan8_mem] | ||||
| %endif | %endif | ||||
| .nextblock | |||||
| .nextblock: | |||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| movzx r6, byte [r4+r6] | movzx r6, byte [r4+r6] | ||||
| test r6, r6 | test r6, r6 | ||||
| @@ -504,7 +504,7 @@ cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s | |||||
| ADD rsp, pad | ADD rsp, pad | ||||
| RET | RET | ||||
| .no_dc | |||||
| .no_dc: | |||||
| mov r6d, dword [r1+r5*4] | mov r6d, dword [r1+r5*4] | ||||
| add r6, r0 | add r6, r0 | ||||
| add word [r2], 32 | add word [r2], 32 | ||||
| @@ -514,7 +514,7 @@ cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s | |||||
| mov r6d, dword [r1+r5*4] | mov r6d, dword [r1+r5*4] | ||||
| lea r6, [r0+r6+4] | lea r6, [r0+r6+4] | ||||
| IDCT8_ADD_MMX_END r6 , rsp+8, r3 | IDCT8_ADD_MMX_END r6 , rsp+8, r3 | ||||
| .skipblock | |||||
| .skipblock: | |||||
| add r5, 4 | add r5, 4 | ||||
| add r2, 128 | add r2, 128 | ||||
| cmp r5, 16 | cmp r5, 16 | ||||
| @@ -531,7 +531,7 @@ cglobal h264_idct8_add4_8_sse2, 5, 8 + npicregs, 10, dst1, block_offset, block, | |||||
| %ifdef PIC | %ifdef PIC | ||||
| lea picregq, [scan8_mem] | lea picregq, [scan8_mem] | ||||
| %endif | %endif | ||||
| .nextblock | |||||
| .nextblock: | |||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| movzx r6, byte [r4+r6] | movzx r6, byte [r4+r6] | ||||
| test r6, r6 | test r6, r6 | ||||
| @@ -560,7 +560,7 @@ INIT_MMX | |||||
| cmp r5, 16 | cmp r5, 16 | ||||
| jl .nextblock | jl .nextblock | ||||
| REP_RET | REP_RET | ||||
| .no_dc | |||||
| .no_dc: | |||||
| INIT_XMM | INIT_XMM | ||||
| mov dst2d, dword [r1+r5*4] | mov dst2d, dword [r1+r5*4] | ||||
| add dst2q, r0 | add dst2q, r0 | ||||
| @@ -568,7 +568,7 @@ INIT_XMM | |||||
| %if ARCH_X86_64 == 0 | %if ARCH_X86_64 == 0 | ||||
| mov r1, r1m | mov r1, r1m | ||||
| %endif | %endif | ||||
| .skipblock | |||||
| .skipblock: | |||||
| add r5, 4 | add r5, 4 | ||||
| add r2, 128 | add r2, 128 | ||||
| cmp r5, 16 | cmp r5, 16 | ||||
| @@ -577,7 +577,7 @@ INIT_XMM | |||||
| INIT_MMX | INIT_MMX | ||||
| h264_idct_add8_mmx_plane: | h264_idct_add8_mmx_plane: | ||||
| .nextblock | |||||
| .nextblock: | |||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| movzx r6, byte [r4+r6] | movzx r6, byte [r4+r6] | ||||
| or r6w, word [r2] | or r6w, word [r2] | ||||
| @@ -592,7 +592,7 @@ h264_idct_add8_mmx_plane: | |||||
| add r0, dword [r1+r5*4] | add r0, dword [r1+r5*4] | ||||
| %endif | %endif | ||||
| IDCT4_ADD r0, r2, r3 | IDCT4_ADD r0, r2, r3 | ||||
| .skipblock | |||||
| .skipblock: | |||||
| inc r5 | inc r5 | ||||
| add r2, 32 | add r2, 32 | ||||
| test r5, 3 | test r5, 3 | ||||
| @@ -621,8 +621,8 @@ cglobal h264_idct_add8_8_mmx, 5, 8 + npicregs, 0, dst1, block_offset, block, str | |||||
| call h264_idct_add8_mmx_plane | call h264_idct_add8_mmx_plane | ||||
| RET | RET | ||||
| h264_idct_add8_mmx2_plane | |||||
| .nextblock | |||||
| h264_idct_add8_mmx2_plane: | |||||
| .nextblock: | |||||
| movzx r6, byte [scan8+r5] | movzx r6, byte [scan8+r5] | ||||
| movzx r6, byte [r4+r6] | movzx r6, byte [r4+r6] | ||||
| test r6, r6 | test r6, r6 | ||||
| @@ -641,7 +641,7 @@ h264_idct_add8_mmx2_plane | |||||
| test r5, 3 | test r5, 3 | ||||
| jnz .nextblock | jnz .nextblock | ||||
| rep ret | rep ret | ||||
| .try_dc | |||||
| .try_dc: | |||||
| movsx r6, word [r2] | movsx r6, word [r2] | ||||
| test r6, r6 | test r6, r6 | ||||
| jz .skipblock | jz .skipblock | ||||
| @@ -655,7 +655,7 @@ h264_idct_add8_mmx2_plane | |||||
| add r0, dword [r1+r5*4] | add r0, dword [r1+r5*4] | ||||
| %endif | %endif | ||||
| DC_ADD_MMX2_OP movh, r0, r3, r6 | DC_ADD_MMX2_OP movh, r0, r3, r6 | ||||
| .skipblock | |||||
| .skipblock: | |||||
| inc r5 | inc r5 | ||||
| add r2, 32 | add r2, 32 | ||||
| test r5, 3 | test r5, 3 | ||||
| @@ -734,7 +734,7 @@ h264_add8x4_idct_sse2: | |||||
| add r0, r0m | add r0, r0m | ||||
| %endif | %endif | ||||
| call h264_add8x4_idct_sse2 | call h264_add8x4_idct_sse2 | ||||
| .cycle%1end | |||||
| .cycle%1end: | |||||
| %if %1 < 7 | %if %1 < 7 | ||||
| add r2, 64 | add r2, 64 | ||||
| %endif | %endif | ||||
| @@ -770,7 +770,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5 + ARCH_X86_64, 8 | |||||
| %endif | %endif | ||||
| call h264_add8x4_idct_sse2 | call h264_add8x4_idct_sse2 | ||||
| jmp .cycle%1end | jmp .cycle%1end | ||||
| .try%1dc | |||||
| .try%1dc: | |||||
| movsx r0, word [r2 ] | movsx r0, word [r2 ] | ||||
| or r0w, word [r2+32] | or r0w, word [r2+32] | ||||
| jz .cycle%1end | jz .cycle%1end | ||||
| @@ -781,7 +781,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5 + ARCH_X86_64, 8 | |||||
| add r0, r0m | add r0, r0m | ||||
| %endif | %endif | ||||
| call h264_idct_dc_add8_mmx2 | call h264_idct_dc_add8_mmx2 | ||||
| .cycle%1end | |||||
| .cycle%1end: | |||||
| %if %1 < 7 | %if %1 < 7 | ||||
| add r2, 64 | add r2, 64 | ||||
| %endif | %endif | ||||
| @@ -817,7 +817,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7 + ARCH_X86_64, 8 | |||||
| %endif | %endif | ||||
| call h264_add8x4_idct_sse2 | call h264_add8x4_idct_sse2 | ||||
| jmp .cycle%1end | jmp .cycle%1end | ||||
| .try%1dc | |||||
| .try%1dc: | |||||
| movsx r0, word [r2 ] | movsx r0, word [r2 ] | ||||
| or r0w, word [r2+32] | or r0w, word [r2+32] | ||||
| jz .cycle%1end | jz .cycle%1end | ||||
| @@ -830,7 +830,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7 + ARCH_X86_64, 8 | |||||
| add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))] | add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))] | ||||
| %endif | %endif | ||||
| call h264_idct_dc_add8_mmx2 | call h264_idct_dc_add8_mmx2 | ||||
| .cycle%1end | |||||
| .cycle%1end: | |||||
| %if %1 == 1 | %if %1 == 1 | ||||
| add r2, 384+64 | add r2, 384+64 | ||||
| %elif %1 < 3 | %elif %1 < 3 | ||||
| @@ -225,7 +225,7 @@ IDCT8_DC_ADD | |||||
| ; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) | ; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| %macro AC 1 | %macro AC 1 | ||||
| .ac%1 | |||||
| .ac%1: | |||||
| mov r5d, [r1+(%1+0)*4] | mov r5d, [r1+(%1+0)*4] | ||||
| call add4x4_idct %+ SUFFIX | call add4x4_idct %+ SUFFIX | ||||
| mov r5d, [r1+(%1+1)*4] | mov r5d, [r1+(%1+1)*4] | ||||
| @@ -484,7 +484,7 @@ cglobal pred16x16_plane_%1, 2,9,7 | |||||
| %endif | %endif | ||||
| mov r4, 8 | mov r4, 8 | ||||
| .loop | |||||
| .loop: | |||||
| mova m3, m0 ; b[0..7] | mova m3, m0 ; b[0..7] | ||||
| mova m4, m2 ; b[8..15] | mova m4, m2 ; b[8..15] | ||||
| psraw m3, 5 | psraw m3, 5 | ||||
| @@ -680,7 +680,7 @@ cglobal pred8x8_plane, 2,9,7 | |||||
| mov r4, 4 | mov r4, 4 | ||||
| ALIGN 16 | ALIGN 16 | ||||
| .loop | |||||
| .loop: | |||||
| %if mmsize == 16 | %if mmsize == 16 | ||||
| mova m3, m0 ; b[0..7] | mova m3, m0 ; b[0..7] | ||||
| paddw m0, m1 | paddw m0, m1 | ||||
| @@ -1045,7 +1045,7 @@ cglobal pred8x8l_top_dc_%1, 4,4 | |||||
| psrlq mm5, 56 | psrlq mm5, 56 | ||||
| psllq mm5, 56 | psllq mm5, 56 | ||||
| pxor mm1, mm5 | pxor mm1, mm5 | ||||
| .body | |||||
| .body: | |||||
| PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 | PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 | ||||
| psadbw mm7, mm0 | psadbw mm7, mm0 | ||||
| paddw mm7, [pw_4] | paddw mm7, [pw_4] | ||||
| @@ -1141,7 +1141,7 @@ cglobal pred8x8l_dc_%1, 4,5 | |||||
| jz .fix_lt_2 | jz .fix_lt_2 | ||||
| test r2, r2 | test r2, r2 | ||||
| jz .fix_tr_1 | jz .fix_tr_1 | ||||
| .body | |||||
| .body: | |||||
| lea r1, [r0+r3*2] | lea r1, [r0+r3*2] | ||||
| PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 | PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 | ||||
| pxor mm0, mm0 | pxor mm0, mm0 | ||||
| @@ -1276,7 +1276,7 @@ cglobal pred8x8l_vertical_%1, 4,4 | |||||
| psrlq mm5, 56 | psrlq mm5, 56 | ||||
| psllq mm5, 56 | psllq mm5, 56 | ||||
| pxor mm1, mm5 | pxor mm1, mm5 | ||||
| .body | |||||
| .body: | |||||
| PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 | PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 | ||||
| %rep 3 | %rep 3 | ||||
| movq [r0+r3*1], mm0 | movq [r0+r3*1], mm0 | ||||
| @@ -1576,7 +1576,7 @@ cglobal pred8x8l_down_right_mmxext, 4,5 | |||||
| psllq mm5, 56 | psllq mm5, 56 | ||||
| pxor mm1, mm5 | pxor mm1, mm5 | ||||
| jmp .do_top | jmp .do_top | ||||
| .body | |||||
| .body: | |||||
| lea r1, [r0+r3*2] | lea r1, [r0+r3*2] | ||||
| movq mm1, mm7 | movq mm1, mm7 | ||||
| movq mm7, mm5 | movq mm7, mm5 | ||||
| @@ -1822,7 +1822,7 @@ cglobal pred8x8l_vertical_right_mmxext, 4,5 | |||||
| jz .fix_lt_2 | jz .fix_lt_2 | ||||
| test r2, r2 | test r2, r2 | ||||
| jz .fix_tr_1 | jz .fix_tr_1 | ||||
| .do_top | |||||
| .do_top: | |||||
| PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 | PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 | ||||
| lea r1, [r0+r3*2] | lea r1, [r0+r3*2] | ||||
| movq mm2, mm6 | movq mm2, mm6 | ||||
| @@ -1931,7 +1931,7 @@ cglobal pred8x8l_vertical_right_%1, 4,5,7 | |||||
| jz .fix_lt_2 | jz .fix_lt_2 | ||||
| test r2, r2 | test r2, r2 | ||||
| jz .fix_tr_1 | jz .fix_tr_1 | ||||
| .do_top | |||||
| .do_top: | |||||
| PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 | PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 | ||||
| lea r1, [r0+r3*2] | lea r1, [r0+r3*2] | ||||
| movq2dq xmm4, mm6 | movq2dq xmm4, mm6 | ||||
| @@ -264,7 +264,7 @@ cglobal_mc %1, %2, mc20, %3, 3,4,9 | |||||
| %else | %else | ||||
| %define p16 [pw_16] | %define p16 [pw_16] | ||||
| %endif | %endif | ||||
| .nextrow | |||||
| .nextrow: | |||||
| %if %0 == 4 | %if %0 == 4 | ||||
| movu m2, [r1-4] | movu m2, [r1-4] | ||||
| movu m3, [r1-2] | movu m3, [r1-2] | ||||
| @@ -330,7 +330,7 @@ MC_CACHE MC30 | |||||
| %macro MC10 3-4 | %macro MC10 3-4 | ||||
| cglobal_mc %1, %2, mc10, %3, 3,5,9 | cglobal_mc %1, %2, mc10, %3, 3,5,9 | ||||
| mov r4, r1 | mov r4, r1 | ||||
| .body | |||||
| .body: | |||||
| mov r3d, %3 | mov r3d, %3 | ||||
| mova m1, [pw_pixel_max] | mova m1, [pw_pixel_max] | ||||
| %if num_mmregs > 8 | %if num_mmregs > 8 | ||||
| @@ -339,7 +339,7 @@ cglobal_mc %1, %2, mc10, %3, 3,5,9 | |||||
| %else | %else | ||||
| %define p16 [pw_16] | %define p16 [pw_16] | ||||
| %endif | %endif | ||||
| .nextrow | |||||
| .nextrow: | |||||
| %if %0 == 4 | %if %0 == 4 | ||||
| movu m2, [r1-4] | movu m2, [r1-4] | ||||
| movu m3, [r1-2] | movu m3, [r1-2] | ||||
| @@ -446,7 +446,7 @@ MC MC02 | |||||
| %macro MC01 3 | %macro MC01 3 | ||||
| cglobal_mc %1, %2, mc01, %3, 3,5,8 | cglobal_mc %1, %2, mc01, %3, 3,5,8 | ||||
| mov r4, r1 | mov r4, r1 | ||||
| .body | |||||
| .body: | |||||
| PRELOAD_V | PRELOAD_V | ||||
| sub r4, r2 | sub r4, r2 | ||||
| @@ -535,7 +535,7 @@ SWAP 0,1,2,3,4,5 | |||||
| ; this REALLY needs x86_64 | ; this REALLY needs x86_64 | ||||
| cglobal_mc %1, %2, mc11, %3, 3,6,8 | cglobal_mc %1, %2, mc11, %3, 3,6,8 | ||||
| mov r4, r1 | mov r4, r1 | ||||
| .body | |||||
| .body: | |||||
| PRELOAD_V | PRELOAD_V | ||||
| sub r0, r2 | sub r0, r2 | ||||
| @@ -778,7 +778,7 @@ cglobal_mc %1, %2, mc12, %3, 3,7,12 | |||||
| call put_hv%3_10_%1 | call put_hv%3_10_%1 | ||||
| xor r4d, r4d | xor r4d, r4d | ||||
| .body | |||||
| .body: | |||||
| mov r3d, %3 | mov r3d, %3 | ||||
| pxor m0, m0 | pxor m0, m0 | ||||
| mova m7, [pw_pixel_max] | mova m7, [pw_pixel_max] | ||||
| @@ -837,7 +837,7 @@ put_h%2_10_%1: | |||||
| mov r3d, %2 | mov r3d, %2 | ||||
| xor r4d, r4d | xor r4d, r4d | ||||
| mova m6, [pad20] | mova m6, [pad20] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movu m2, [r5-4] | movu m2, [r5-4] | ||||
| movu m3, [r5-2] | movu m3, [r5-2] | ||||
| movu m4, [r5+0] | movu m4, [r5+0] | ||||
| @@ -864,7 +864,7 @@ H_NRD sse2 , 8 | |||||
| %macro MC21 3 | %macro MC21 3 | ||||
| cglobal_mc %1, %2, mc21, %3, 3,7,12 | cglobal_mc %1, %2, mc21, %3, 3,7,12 | ||||
| mov r5, r1 | mov r5, r1 | ||||
| .body | |||||
| .body: | |||||
| %define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) | %define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) | ||||
| mov r6, rsp ; backup stack pointer | mov r6, rsp ; backup stack pointer | ||||
| and rsp, ~(mmsize-1) ; align stack | and rsp, ~(mmsize-1) ; align stack | ||||
| @@ -73,7 +73,7 @@ SECTION .text | |||||
| INIT_MMX | INIT_MMX | ||||
| cglobal h264_weight_16_mmx2, 6, 6, 0 | cglobal h264_weight_16_mmx2, 6, 6, 0 | ||||
| WEIGHT_SETUP | WEIGHT_SETUP | ||||
| .nextrow | |||||
| .nextrow: | |||||
| WEIGHT_OP 0, 4 | WEIGHT_OP 0, 4 | ||||
| mova [r0 ], m0 | mova [r0 ], m0 | ||||
| WEIGHT_OP 8, 12 | WEIGHT_OP 8, 12 | ||||
| @@ -86,7 +86,7 @@ cglobal h264_weight_16_mmx2, 6, 6, 0 | |||||
| %macro WEIGHT_FUNC_MM 3 | %macro WEIGHT_FUNC_MM 3 | ||||
| cglobal h264_weight_%1_%3, 6, 6, %2 | cglobal h264_weight_%1_%3, 6, 6, %2 | ||||
| WEIGHT_SETUP | WEIGHT_SETUP | ||||
| .nextrow | |||||
| .nextrow: | |||||
| WEIGHT_OP 0, mmsize/2 | WEIGHT_OP 0, mmsize/2 | ||||
| mova [r0], m0 | mova [r0], m0 | ||||
| add r0, r1 | add r0, r1 | ||||
| @@ -105,7 +105,7 @@ cglobal h264_weight_%1_%3, 6, 6, %2 | |||||
| WEIGHT_SETUP | WEIGHT_SETUP | ||||
| sar r2d, 1 | sar r2d, 1 | ||||
| lea r3, [r1*2] | lea r3, [r1*2] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| WEIGHT_OP 0, r1 | WEIGHT_OP 0, r1 | ||||
| movh [r0], m0 | movh [r0], m0 | ||||
| %if mmsize == 16 | %if mmsize == 16 | ||||
| @@ -178,7 +178,7 @@ INIT_MMX | |||||
| cglobal h264_biweight_16_mmx2, 7, 8, 0 | cglobal h264_biweight_16_mmx2, 7, 8, 0 | ||||
| BIWEIGHT_SETUP | BIWEIGHT_SETUP | ||||
| movifnidn r3d, r3m | movifnidn r3d, r3m | ||||
| .nextrow | |||||
| .nextrow: | |||||
| BIWEIGHT_STEPA 0, 1, 0 | BIWEIGHT_STEPA 0, 1, 0 | ||||
| BIWEIGHT_STEPA 1, 2, 4 | BIWEIGHT_STEPA 1, 2, 4 | ||||
| BIWEIGHT_STEPB | BIWEIGHT_STEPB | ||||
| @@ -197,7 +197,7 @@ cglobal h264_biweight_16_mmx2, 7, 8, 0 | |||||
| cglobal h264_biweight_%1_%3, 7, 8, %2 | cglobal h264_biweight_%1_%3, 7, 8, %2 | ||||
| BIWEIGHT_SETUP | BIWEIGHT_SETUP | ||||
| movifnidn r3d, r3m | movifnidn r3d, r3m | ||||
| .nextrow | |||||
| .nextrow: | |||||
| BIWEIGHT_STEPA 0, 1, 0 | BIWEIGHT_STEPA 0, 1, 0 | ||||
| BIWEIGHT_STEPA 1, 2, mmsize/2 | BIWEIGHT_STEPA 1, 2, mmsize/2 | ||||
| BIWEIGHT_STEPB | BIWEIGHT_STEPB | ||||
| @@ -220,7 +220,7 @@ cglobal h264_biweight_%1_%3, 7, 8, %2 | |||||
| movifnidn r3d, r3m | movifnidn r3d, r3m | ||||
| sar r3, 1 | sar r3, 1 | ||||
| lea r4, [r2*2] | lea r4, [r2*2] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| BIWEIGHT_STEPA 0, 1, 0 | BIWEIGHT_STEPA 0, 1, 0 | ||||
| BIWEIGHT_STEPA 1, 2, r2 | BIWEIGHT_STEPA 1, 2, r2 | ||||
| BIWEIGHT_STEPB | BIWEIGHT_STEPB | ||||
| @@ -281,7 +281,7 @@ cglobal h264_biweight_16_ssse3, 7, 8, 8 | |||||
| BIWEIGHT_SSSE3_SETUP | BIWEIGHT_SSSE3_SETUP | ||||
| movifnidn r3d, r3m | movifnidn r3d, r3m | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movh m0, [r0] | movh m0, [r0] | ||||
| movh m2, [r0+8] | movh m2, [r0+8] | ||||
| movh m3, [r1+8] | movh m3, [r1+8] | ||||
| @@ -302,7 +302,7 @@ cglobal h264_biweight_8_ssse3, 7, 8, 8 | |||||
| sar r3, 1 | sar r3, 1 | ||||
| lea r4, [r2*2] | lea r4, [r2*2] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movh m0, [r0] | movh m0, [r0] | ||||
| movh m1, [r1] | movh m1, [r1] | ||||
| movh m2, [r0+r2] | movh m2, [r0+r2] | ||||
| @@ -40,7 +40,7 @@ SECTION .text | |||||
| ; int weight, int offset); | ; int weight, int offset); | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| %macro WEIGHT_PROLOGUE 0 | %macro WEIGHT_PROLOGUE 0 | ||||
| .prologue | |||||
| .prologue: | |||||
| PROLOGUE 0,6,8 | PROLOGUE 0,6,8 | ||||
| movifnidn r0, r0mp | movifnidn r0, r0mp | ||||
| movifnidn r1d, r1m | movifnidn r1d, r1m | ||||
| @@ -93,7 +93,7 @@ SECTION .text | |||||
| cglobal h264_weight_16_10_%1 | cglobal h264_weight_16_10_%1 | ||||
| WEIGHT_PROLOGUE | WEIGHT_PROLOGUE | ||||
| WEIGHT_SETUP %1 | WEIGHT_SETUP %1 | ||||
| .nextrow | |||||
| .nextrow: | |||||
| WEIGHT_OP %1, 0 | WEIGHT_OP %1, 0 | ||||
| mova [r0 ], m5 | mova [r0 ], m5 | ||||
| WEIGHT_OP %1, 16 | WEIGHT_OP %1, 16 | ||||
| @@ -113,7 +113,7 @@ WEIGHT_FUNC_DBL sse4 | |||||
| cglobal h264_weight_8_10_%1 | cglobal h264_weight_8_10_%1 | ||||
| WEIGHT_PROLOGUE | WEIGHT_PROLOGUE | ||||
| WEIGHT_SETUP %1 | WEIGHT_SETUP %1 | ||||
| .nextrow | |||||
| .nextrow: | |||||
| WEIGHT_OP %1, 0 | WEIGHT_OP %1, 0 | ||||
| mova [r0], m5 | mova [r0], m5 | ||||
| add r0, r1 | add r0, r1 | ||||
| @@ -133,7 +133,7 @@ cglobal h264_weight_4_10_%1 | |||||
| sar r2d, 1 | sar r2d, 1 | ||||
| WEIGHT_SETUP %1 | WEIGHT_SETUP %1 | ||||
| lea r3, [r1*2] | lea r3, [r1*2] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| WEIGHT_OP %1, 0, r1 | WEIGHT_OP %1, 0, r1 | ||||
| movh [r0], m5 | movh [r0], m5 | ||||
| movhps [r0+r1], m5 | movhps [r0+r1], m5 | ||||
| @@ -159,7 +159,7 @@ DECLARE_REG_TMP 7 | |||||
| %endif | %endif | ||||
| %macro BIWEIGHT_PROLOGUE 0 | %macro BIWEIGHT_PROLOGUE 0 | ||||
| .prologue | |||||
| .prologue: | |||||
| PROLOGUE 0,8,8 | PROLOGUE 0,8,8 | ||||
| movifnidn r0, r0mp | movifnidn r0, r0mp | ||||
| movifnidn r1, r1mp | movifnidn r1, r1mp | ||||
| @@ -221,7 +221,7 @@ DECLARE_REG_TMP 7 | |||||
| cglobal h264_biweight_16_10_%1 | cglobal h264_biweight_16_10_%1 | ||||
| BIWEIGHT_PROLOGUE | BIWEIGHT_PROLOGUE | ||||
| BIWEIGHT_SETUP %1 | BIWEIGHT_SETUP %1 | ||||
| .nextrow | |||||
| .nextrow: | |||||
| BIWEIGHT %1, 0 | BIWEIGHT %1, 0 | ||||
| mova [r0 ], m0 | mova [r0 ], m0 | ||||
| BIWEIGHT %1, 16 | BIWEIGHT %1, 16 | ||||
| @@ -241,7 +241,7 @@ BIWEIGHT_FUNC_DBL sse4 | |||||
| cglobal h264_biweight_8_10_%1 | cglobal h264_biweight_8_10_%1 | ||||
| BIWEIGHT_PROLOGUE | BIWEIGHT_PROLOGUE | ||||
| BIWEIGHT_SETUP %1 | BIWEIGHT_SETUP %1 | ||||
| .nextrow | |||||
| .nextrow: | |||||
| BIWEIGHT %1, 0 | BIWEIGHT %1, 0 | ||||
| mova [r0], m0 | mova [r0], m0 | ||||
| add r0, r2 | add r0, r2 | ||||
| @@ -261,7 +261,7 @@ cglobal h264_biweight_4_10_%1 | |||||
| BIWEIGHT_SETUP %1 | BIWEIGHT_SETUP %1 | ||||
| sar r3d, 1 | sar r3d, 1 | ||||
| lea r4, [r2*2] | lea r4, [r2*2] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| BIWEIGHT %1, 0, r2 | BIWEIGHT %1, 0, r2 | ||||
| movh [r0 ], m0 | movh [r0 ], m0 | ||||
| movhps [r0+r2], m0 | movhps [r0+r2], m0 | ||||
| @@ -139,7 +139,7 @@ cglobal vp6_filter_diag4, 5, 7, 8 | |||||
| mov r3, rsp | mov r3, rsp | ||||
| mov r6, 11 | mov r6, 11 | ||||
| .nextrow | |||||
| .nextrow: | |||||
| DIAG4 r1, -1, 0, 1, 2, r3 | DIAG4 r1, -1, 0, 1, 2, r3 | ||||
| add r3, 8 | add r3, 8 | ||||
| add r1, r2 | add r1, r2 | ||||
| @@ -151,7 +151,7 @@ cglobal vp6_filter_diag4, 5, 7, 8 | |||||
| lea r3, [rsp+8] | lea r3, [rsp+8] | ||||
| mov r6, 8 | mov r6, 8 | ||||
| .nextcol | |||||
| .nextcol: | |||||
| DIAG4 r3, -8, 0, 8, 16, r0 | DIAG4 r3, -8, 0, 8, 16, r0 | ||||
| add r3, 8 | add r3, 8 | ||||
| add r0, r2 | add r0, r2 | ||||
| @@ -189,7 +189,7 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h | |||||
| mova m6, [sixtap_filter_hb+mxq*8-32] | mova m6, [sixtap_filter_hb+mxq*8-32] | ||||
| mova m7, [sixtap_filter_hb+mxq*8-16] | mova m7, [sixtap_filter_hb+mxq*8-16] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movu m0, [srcq-2] | movu m0, [srcq-2] | ||||
| mova m1, m0 | mova m1, m0 | ||||
| mova m2, m0 | mova m2, m0 | ||||
| @@ -229,7 +229,7 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h | |||||
| mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes | mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes | ||||
| mova m6, [fourtap_filter_hb+mxq] | mova m6, [fourtap_filter_hb+mxq] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movu m0, [srcq-1] | movu m0, [srcq-1] | ||||
| mova m1, m0 | mova m1, m0 | ||||
| pshufb m0, m3 | pshufb m0, m3 | ||||
| @@ -264,7 +264,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr | |||||
| movh m2, [srcq+2*srcstrideq] | movh m2, [srcq+2*srcstrideq] | ||||
| add srcq, srcstrideq | add srcq, srcstrideq | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movh m3, [srcq+2*srcstrideq] ; read new row | movh m3, [srcq+2*srcstrideq] ; read new row | ||||
| mova m4, m0 | mova m4, m0 | ||||
| mova m0, m1 | mova m0, m1 | ||||
| @@ -304,7 +304,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr | |||||
| movh m3, [srcq] | movh m3, [srcq] | ||||
| movh m4, [srcq+srcstrideq] | movh m4, [srcq+srcstrideq] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movh m5, [srcq+2*srcstrideq] ; read new row | movh m5, [srcq+2*srcstrideq] ; read new row | ||||
| mova m6, m0 | mova m6, m0 | ||||
| punpcklbw m6, m5 | punpcklbw m6, m5 | ||||
| @@ -350,7 +350,7 @@ cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, he | |||||
| movq mm7, [pw_64] | movq mm7, [pw_64] | ||||
| pxor mm6, mm6 | pxor mm6, mm6 | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels | movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels | ||||
| ; first set of 2 pixels | ; first set of 2 pixels | ||||
| @@ -399,7 +399,7 @@ cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, he | |||||
| movq mm7, [pw_64] | movq mm7, [pw_64] | ||||
| pxor mm3, mm3 | pxor mm3, mm3 | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels | movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels | ||||
| ; first set of 2 pixels | ; first set of 2 pixels | ||||
| @@ -459,7 +459,7 @@ cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, h | |||||
| mova m8, [mxq+32] | mova m8, [mxq+32] | ||||
| mova m9, [mxq+48] | mova m9, [mxq+48] | ||||
| %endif | %endif | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movq m0, [srcq-1] | movq m0, [srcq-1] | ||||
| movq m1, [srcq-0] | movq m1, [srcq-0] | ||||
| movq m2, [srcq+1] | movq m2, [srcq+1] | ||||
| @@ -510,7 +510,7 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h | |||||
| mova m12, [mxq+64] | mova m12, [mxq+64] | ||||
| mova m13, [mxq+80] | mova m13, [mxq+80] | ||||
| %endif | %endif | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movq m0, [srcq-2] | movq m0, [srcq-2] | ||||
| movq m1, [srcq-1] | movq m1, [srcq-1] | ||||
| movq m2, [srcq-0] | movq m2, [srcq-0] | ||||
| @@ -577,7 +577,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr | |||||
| punpcklbw m1, m7 | punpcklbw m1, m7 | ||||
| punpcklbw m2, m7 | punpcklbw m2, m7 | ||||
| .nextrow | |||||
| .nextrow: | |||||
| ; first calculate negative taps (to prevent losing positive overflows) | ; first calculate negative taps (to prevent losing positive overflows) | ||||
| movh m4, [srcq+2*srcstrideq] ; read new row | movh m4, [srcq+2*srcstrideq] ; read new row | ||||
| punpcklbw m4, m7 | punpcklbw m4, m7 | ||||
| @@ -635,7 +635,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr | |||||
| punpcklbw m3, m7 | punpcklbw m3, m7 | ||||
| punpcklbw m4, m7 | punpcklbw m4, m7 | ||||
| .nextrow | |||||
| .nextrow: | |||||
| ; first calculate negative taps (to prevent losing positive overflows) | ; first calculate negative taps (to prevent losing positive overflows) | ||||
| mova m5, m1 | mova m5, m1 | ||||
| pmullw m5, [myq+16] | pmullw m5, [myq+16] | ||||
| @@ -689,7 +689,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p | |||||
| mova m5, [bilinear_filter_vw+myq-1*16] | mova m5, [bilinear_filter_vw+myq-1*16] | ||||
| neg myq | neg myq | ||||
| mova m4, [bilinear_filter_vw+myq+7*16] | mova m4, [bilinear_filter_vw+myq+7*16] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movh m0, [srcq+srcstrideq*0] | movh m0, [srcq+srcstrideq*0] | ||||
| movh m1, [srcq+srcstrideq*1] | movh m1, [srcq+srcstrideq*1] | ||||
| movh m3, [srcq+srcstrideq*2] | movh m3, [srcq+srcstrideq*2] | ||||
| @@ -733,7 +733,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride | |||||
| mova m5, [bilinear_filter_vw+mxq-1*16] | mova m5, [bilinear_filter_vw+mxq-1*16] | ||||
| neg mxq | neg mxq | ||||
| mova m4, [bilinear_filter_vw+mxq+7*16] | mova m4, [bilinear_filter_vw+mxq+7*16] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movh m0, [srcq+srcstrideq*0+0] | movh m0, [srcq+srcstrideq*0+0] | ||||
| movh m1, [srcq+srcstrideq*0+1] | movh m1, [srcq+srcstrideq*0+1] | ||||
| movh m2, [srcq+srcstrideq*1+0] | movh m2, [srcq+srcstrideq*1+0] | ||||
| @@ -783,7 +783,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, p | |||||
| %endif | %endif | ||||
| pxor m4, m4 | pxor m4, m4 | ||||
| mova m3, [bilinear_filter_vb+myq-16] | mova m3, [bilinear_filter_vb+myq-16] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movh m0, [srcq+srcstrideq*0] | movh m0, [srcq+srcstrideq*0] | ||||
| movh m1, [srcq+srcstrideq*1] | movh m1, [srcq+srcstrideq*1] | ||||
| movh m2, [srcq+srcstrideq*2] | movh m2, [srcq+srcstrideq*2] | ||||
| @@ -820,7 +820,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride | |||||
| pxor m4, m4 | pxor m4, m4 | ||||
| mova m2, [filter_h2_shuf] | mova m2, [filter_h2_shuf] | ||||
| mova m3, [bilinear_filter_vb+mxq-16] | mova m3, [bilinear_filter_vb+mxq-16] | ||||
| .nextrow | |||||
| .nextrow: | |||||
| movu m0, [srcq+srcstrideq*0] | movu m0, [srcq+srcstrideq*0] | ||||
| movu m1, [srcq+srcstrideq*1] | movu m1, [srcq+srcstrideq*1] | ||||
| pshufb m0, m2 | pshufb m0, m2 | ||||
| @@ -1488,7 +1488,7 @@ cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr | |||||
| %endif | %endif | ||||
| %if mmsize == 8 ; mmx / mmxext | %if mmsize == 8 ; mmx / mmxext | ||||
| .next8px | |||||
| .next8px: | |||||
| %endif | %endif | ||||
| %ifidn %1, v | %ifidn %1, v | ||||
| ; read 4 half/full rows of pixels | ; read 4 half/full rows of pixels | ||||
| @@ -361,7 +361,7 @@ cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, s | |||||
| mov src0q, [src0q] | mov src0q, [src0q] | ||||
| add src0q, lenq | add src0q, lenq | ||||
| neg lenq | neg lenq | ||||
| .loop | |||||
| .loop: | |||||
| ; for x86-32 with 7-8 channels we do not have enough gp registers for all src | ; for x86-32 with 7-8 channels we do not have enough gp registers for all src | ||||
| ; pointers, so we have to load some of them from the stack each time | ; pointers, so we have to load some of them from the stack each time | ||||
| %define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5 | %define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5 | ||||
| @@ -30,7 +30,7 @@ SECTION .text | |||||
| cglobal vector_fmul, 4,4,2, dst, src0, src1, len | cglobal vector_fmul, 4,4,2, dst, src0, src1, len | ||||
| lea lenq, [lend*4 - 2*mmsize] | lea lenq, [lend*4 - 2*mmsize] | ||||
| ALIGN 16 | ALIGN 16 | ||||
| .loop | |||||
| .loop: | |||||
| mova m0, [src0q + lenq] | mova m0, [src0q + lenq] | ||||
| mova m1, [src0q + lenq + mmsize] | mova m1, [src0q + lenq + mmsize] | ||||
| mulps m0, m0, [src1q + lenq] | mulps m0, m0, [src1q + lenq] | ||||
| @@ -72,7 +72,7 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len | |||||
| %endif | %endif | ||||
| %endif | %endif | ||||
| lea lenq, [lend*4-2*mmsize] | lea lenq, [lend*4-2*mmsize] | ||||
| .loop | |||||
| .loop: | |||||
| mulps m1, m0, [srcq+lenq ] | mulps m1, m0, [srcq+lenq ] | ||||
| mulps m2, m0, [srcq+lenq+mmsize] | mulps m2, m0, [srcq+lenq+mmsize] | ||||
| addps m1, m1, [dstq+lenq ] | addps m1, m1, [dstq+lenq ] | ||||