- ;*****************************************************************************
- ;* MMX/SSE2/SSSE3-optimized 10-bit H.264 qpel code
- ;*****************************************************************************
- ;* Copyright (C) 2011 x264 project
- ;*
- ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
- ;*
- ;* This file is part of Libav.
- ;*
- ;* Libav is free software; you can redistribute it and/or
- ;* modify it under the terms of the GNU Lesser General Public
- ;* License as published by the Free Software Foundation; either
- ;* version 2.1 of the License, or (at your option) any later version.
- ;*
- ;* Libav is distributed in the hope that it will be useful,
- ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- ;* Lesser General Public License for more details.
- ;*
- ;* You should have received a copy of the GNU Lesser General Public
- ;* License along with Libav; if not, write to the Free Software
- ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ;******************************************************************************
-
- %include "libavutil/x86/x86util.asm"
-
- SECTION_RODATA 32
-
- cextern pw_16
- cextern pw_1
- cextern pb_0
-
- pw_pixel_max: times 8 dw ((1 << 10)-1)
-
- pad10: times 8 dw 10*1023
- pad20: times 8 dw 20*1023
- pad30: times 8 dw 30*1023
- depad: times 4 dd 32*20*1023 + 512
- depad2: times 8 dw 20*1023 + 16*1022 + 16
- unpad: times 8 dw 16*1022/32 ; 16*1022 must be divisible by 32
-
- tap1: times 4 dw 1, -5
- tap2: times 4 dw 20, 20
- tap3: times 4 dw -5, 1
- pd_0f: times 4 dd 0xffff
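-
- ; The 6-tap half-pel filter is (1,-5,20,20,-5,1). With 10-bit input
- ; (pw_pixel_max = 1023) a raw first-pass sum a-5*b+20*c (a,b,c being the
- ; symmetric pair sums) spans [-10*1023, 42*1023] and overflows int16;
- ; subtracting pad20 = 20*1023 recentres it to [-30*1023, 22*1023], which
- ; fits. The second pass then sees inputs biased by -20*1023, and since
- ; its taps sum to 32 the bias contributes -32*20*1023; depad adds that
- ; back plus the 512 rounding term before the final psrad 10 (the two
- ; passes scale by 32*32 = 1024 in total). depad2/unpad undo the bias
- ; when only the first pass is wanted:
- ;   (v - 20*1023 + depad2) >> 5 - unpad = (v + 16) >> 5
- ; which relies on 16*1022 = 511*32, i.e. unpad*32.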
-
- SECTION .text
-
-
- %macro AVG_MOV 2
- pavgw %2, %1
- mova %1, %2
- %endmacro
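-
- ; OP_MOV abstracts the store so each function body below is emitted as
- ; both a put_ and an avg_ variant: put writes the result, avg rounds it
- ; together with what is already in dst via pavgw, i.e. (a+b+1)>>1, the
- ; H.264 rounding average.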
-
- %macro ADDW 3
- %if mmsize == 8
- paddw %1, %2
- %else
- movu %3, %2
- paddw %1, %3
- %endif
- %endmacro
-
- %macro FILT_H 4
- paddw %1, %4
- psubw %1, %2 ; a-b
- psraw %1, 2 ; (a-b)/4
- psubw %1, %2 ; (a-b)/4-b
- paddw %1, %3 ; (a-b)/4-b+c
- psraw %1, 2 ; ((a-b)/4-b+c)/4
- paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- %endmacro
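-
- ; FILT_H computes (a - 5*b + 20*c + rnd) >> 4 without multiplies, where
- ; %4 supplies the rounding term. A scalar C sketch of the same strength
- ; reduction (illustrative only; the asm does this on packed words):
- ;   static int filt_h(int a, int b, int c)
- ;   {
- ;       int t = (a + 16 - b) >> 2;  /* (a - b + 16) / 4              */
- ;       t = (t - b + c) >> 2;       /* fold in another -4*b + 4*c    */
- ;       return t + c;               /* => (a - 5*b + 20*c + 16) >> 4 */
- ;   }
- ; The callers' following psraw 1 completes the (... + 16) >> 5 tap.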
-
- %macro PRELOAD_V 0
- lea r3, [r2*3]
- sub r1, r3
- movu m0, [r1+r2]
- movu m1, [r1+r2*2]
- add r1, r3
- movu m2, [r1]
- movu m3, [r1+r2]
- movu m4, [r1+r2*2]
- add r1, r3
- %endmacro
-
- %macro FILT_V 8
- movu %6, [r1]
- paddw %1, %6
- mova %7, %2
- paddw %7, %5
- mova %8, %3
- paddw %8, %4
- FILT_H %1, %7, %8, [pw_16]
- psraw %1, 1
- CLIPW %1, [pb_0], [pw_pixel_max]
- %endmacro
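-
- ; FILT_V feeds FILT_H with the symmetric pair sums of six source rows
- ; (a = row0+row5, b = row1+row4, c = row2+row3); psraw 1 plus CLIPW then
- ; yield the final vertical half-pel sample clip((a-5*b+20*c+16)>>5).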
-
- %macro MC 1
- %define OP_MOV mova
- INIT_MMX mmxext
- %1 put, 4
- INIT_XMM sse2
- %1 put, 8
-
- %define OP_MOV AVG_MOV
- INIT_MMX mmxext
- %1 avg, 4
- INIT_XMM sse2
- %1 avg, 8
- %endmacro
-
- %macro MCAxA_OP 7
- %if ARCH_X86_32
- cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- mov r0, r0m
- mov r1, r1m
- add r0, %3*2
- add r1, %3*2
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- mov r0, r0m
- mov r1, r1m
- lea r0, [r0+r2*%3]
- lea r1, [r1+r2*%3]
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- mov r0, r0m
- mov r1, r1m
- lea r0, [r0+r2*%3+%3*2]
- lea r1, [r1+r2*%3+%3*2]
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- RET
- %else ; ARCH_X86_64
- cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
- mov r%6, r0
- %assign p1 %6+1
- mov r %+ p1, r1
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- lea r0, [r%6+%3*2]
- lea r1, [r %+ p1+%3*2]
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- lea r0, [r%6+r2*%3]
- lea r1, [r %+ p1+r2*%3]
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- lea r0, [r%6+r2*%3+%3*2]
- lea r1, [r %+ p1+r2*%3+%3*2]
- %if UNIX64 == 0 ; fall through to function
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- RET
- %endif
- %endif
- %endmacro
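-
- ; MCAxA_OP synthesizes the NxN function from four calls to the
- ; (N/2)x(N/2) stub, one per quadrant: +N/2*2 bytes across (2 bytes per
- ; pixel) and +stride*N/2 down. x86_64 parks dst/src in two extra
- ; registers between calls; x86_32 reloads them from the stack arguments.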
-
- ;put/avg, mc, 4/8, nargs, ngprs, nxmms
- %macro cglobal_mc 6
- %assign i %3*2
- %if ARCH_X86_32 || cpuflag(sse2)
- MCAxA_OP %1, %2, %3, i, %4,%5,%6
- %endif
-
- cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
- %if UNIX64 == 0 ; no prologue or epilogue for UNIX64
- call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
- RET
- %endif
-
- stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
- %endmacro
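-
- ; Each MC body is emitted once as a stub_* label ending in a bare ret;
- ; the public symbol merely calls it, except on UNIX64 where no prologue
- ; is needed and the symbol falls straight through into the stub. Mirror
- ; variants (mc30/mc10, mc03/mc01, mc31/mc13/mc33/mc11, ...) jmp into a
- ; stub's .body with adjusted pointers instead of duplicating the code.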
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro COPY4 0
- movu m0, [r1 ]
- OP_MOV [r0 ], m0
- movu m0, [r1+r2 ]
- OP_MOV [r0+r2 ], m0
- movu m0, [r1+r2*2]
- OP_MOV [r0+r2*2], m0
- movu m0, [r1+r3 ]
- OP_MOV [r0+r3 ], m0
- %endmacro
-
- %macro MC00 1
- INIT_MMX mmxext
- cglobal_mc %1, mc00, 4, 3,4,0
- lea r3, [r2*3]
- COPY4
- ret
-
- INIT_XMM sse2
- cglobal %1_h264_qpel8_mc00_10, 3,4
- lea r3, [r2*3]
- COPY4
- lea r0, [r0+r2*4]
- lea r1, [r1+r2*4]
- COPY4
- RET
-
- cglobal %1_h264_qpel16_mc00_10, 3,4
- mov r3d, 8
- .loop:
- movu m0, [r1 ]
- movu m1, [r1 +16]
- OP_MOV [r0 ], m0
- OP_MOV [r0 +16], m1
- movu m0, [r1+r2 ]
- movu m1, [r1+r2+16]
- OP_MOV [r0+r2 ], m0
- OP_MOV [r0+r2+16], m1
- lea r0, [r0+r2*2]
- lea r1, [r1+r2*2]
- dec r3d
- jg .loop
- REP_RET
- %endmacro
-
- %define OP_MOV mova
- MC00 put
-
- %define OP_MOV AVG_MOV
- MC00 avg
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC_CACHE 1
- %define OP_MOV mova
- INIT_MMX mmxext
- %1 put, 4
- INIT_XMM sse2, cache64
- %1 put, 8
- INIT_XMM ssse3, cache64
- %1 put, 8
- INIT_XMM sse2
- %1 put, 8
-
- %define OP_MOV AVG_MOV
- INIT_MMX mmxext
- %1 avg, 4
- INIT_XMM sse2, cache64
- %1 avg, 8
- INIT_XMM ssse3, cache64
- %1 avg, 8
- INIT_XMM sse2
- %1 avg, 8
- %endmacro
-
- %macro MC20 2
- cglobal_mc %1, mc20, %2, 3,4,9
- mov r3d, %2
- mova m1, [pw_pixel_max]
- %if num_mmregs > 8
- mova m8, [pw_16]
- %define p16 m8
- %else
- %define p16 [pw_16]
- %endif
- .nextrow:
- %if cpuflag(sse2) && notcpuflag(cache64)
- movu m2, [r1-4]
- movu m3, [r1-2]
- movu m4, [r1+0]
- ADDW m2, [r1+6], m5
- ADDW m3, [r1+4], m5
- ADDW m4, [r1+2], m5
- %else ; movu is slow on these processors
- %if mmsize==16
- movu m2, [r1-4]
- movu m0, [r1+6]
- mova m6, m0
- psrldq m0, 6
-
- paddw m6, m2
- PALIGNR m3, m0, m2, 2, m5
- PALIGNR m7, m0, m2, 8, m5
- paddw m3, m7
- PALIGNR m4, m0, m2, 4, m5
- PALIGNR m7, m0, m2, 6, m5
- paddw m4, m7
- SWAP 2, 6
- %else
- movu m2, [r1-4]
- movu m6, [r1+4]
- PALIGNR m3, m6, m2, 2, m5
- paddw m3, m6
- PALIGNR m4, m6, m2, 4, m5
- PALIGNR m7, m6, m2, 6, m5
- paddw m4, m7
- paddw m2, [r1+6]
- %endif
- %endif
-
- FILT_H m2, m3, m4, p16
- psraw m2, 1
- pxor m0, m0
- CLIPW m2, m0, m1
- OP_MOV [r0], m2
- add r0, r2
- add r1, r2
- dec r3d
- jg .nextrow
- rep ret
- %endmacro
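-
- ; Two ways to gather the six shifted source windows: plain SSE2 issues
- ; one unaligned load per tap, while the cache64 and MMX paths load the
- ; ends of the window once and synthesize the intermediate shifts with
- ; PALIGNR (or its SSE2 shift/or emulation), avoiding cacheline-split
- ; loads on CPUs where those are expensive.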
-
- MC_CACHE MC20
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC30 2
- cglobal_mc %1, mc30, %2, 3,5,9
- lea r4, [r1+2]
- jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
- %endmacro
-
- MC_CACHE MC30
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC10 2
- cglobal_mc %1, mc10, %2, 3,5,9
- mov r4, r1
- .body:
- mov r3d, %2
- mova m1, [pw_pixel_max]
- %if num_mmregs > 8
- mova m8, [pw_16]
- %define p16 m8
- %else
- %define p16 [pw_16]
- %endif
- .nextrow:
- %if cpuflag(sse2) && notcpuflag(cache64)
- movu m2, [r1-4]
- movu m3, [r1-2]
- movu m4, [r1+0]
- ADDW m2, [r1+6], m5
- ADDW m3, [r1+4], m5
- ADDW m4, [r1+2], m5
- %else ; movu is slow on these processors
- %if mmsize==16
- movu m2, [r1-4]
- movu m0, [r1+6]
- mova m6, m0
- psrldq m0, 6
-
- paddw m6, m2
- PALIGNR m3, m0, m2, 2, m5
- PALIGNR m7, m0, m2, 8, m5
- paddw m3, m7
- PALIGNR m4, m0, m2, 4, m5
- PALIGNR m7, m0, m2, 6, m5
- paddw m4, m7
- SWAP 2, 6
- %else
- movu m2, [r1-4]
- movu m6, [r1+4]
- PALIGNR m3, m6, m2, 2, m5
- paddw m3, m6
- PALIGNR m4, m6, m2, 4, m5
- PALIGNR m7, m6, m2, 6, m5
- paddw m4, m7
- paddw m2, [r1+6]
- %endif
- %endif
-
- FILT_H m2, m3, m4, p16
- psraw m2, 1
- pxor m0, m0
- CLIPW m2, m0, m1
- movu m3, [r4]
- pavgw m2, m3
- OP_MOV [r0], m2
- add r0, r2
- add r1, r2
- add r4, r2
- dec r3d
- jg .nextrow
- rep ret
- %endmacro
-
- MC_CACHE MC10
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro V_FILT 10
- v_filt%9_%10_10:
- add r4, r2
- .no_addr4:
- FILT_V m0, m1, m2, m3, m4, m5, m6, m7
- add r1, r2
- add r0, r2
- ret
- %endmacro
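-
- ; One copy of the vertical filter stub is emitted per register rotation
- ; (four for MMX, six for SSE2) so the sliding six-row window never moves:
- ; each call loads a single new row and the SWAP at the call site renames
- ; the registers instead of shuffling data. The .no_addr4 entry skips the
- ; r4 advance for callers (mc02) that carry no second pointer.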
-
- INIT_MMX mmxext
- RESET_MM_PERMUTATION
- %assign i 0
- %rep 4
- V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
- SWAP 0,1,2,3,4,5
- %assign i i+1
- %endrep
-
- INIT_XMM sse2
- RESET_MM_PERMUTATION
- %assign i 0
- %rep 6
- V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
- SWAP 0,1,2,3,4,5
- %assign i i+1
- %endrep
-
- %macro MC02 2
- cglobal_mc %1, mc02, %2, 3,4,8
- PRELOAD_V
-
- sub r0, r2
- %assign j 0
- %rep %2
- %assign i (j % 6)
- call v_filt%2_ %+ i %+ _10.no_addr4
- OP_MOV [r0], m0
- SWAP 0,1,2,3,4,5
- %assign j j+1
- %endrep
- ret
- %endmacro
-
- MC MC02
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC01 2
- cglobal_mc %1, mc01, %2, 3,5,8
- mov r4, r1
- .body:
- PRELOAD_V
-
- sub r4, r2
- sub r0, r2
- %assign j 0
- %rep %2
- %assign i (j % 6)
- call v_filt%2_ %+ i %+ _10
- movu m7, [r4]
- pavgw m0, m7
- OP_MOV [r0], m0
- SWAP 0,1,2,3,4,5
- %assign j j+1
- %endrep
- ret
- %endmacro
-
- MC MC01
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC03 2
- cglobal_mc %1, mc03, %2, 3,5,8
- lea r4, [r1+r2]
- jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
- %endmacro
-
- MC MC03
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro H_FILT_AVG 2-3
- h_filt%1_%2_10:
- ;FILT_H with fewer registers and averaged with the FILT_V result
- ;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are used in the next iteration
- ;unfortunately I need three registers, so m5 will have to be re-read from memory
- movu m5, [r4-4]
- ADDW m5, [r4+6], m7
- movu m6, [r4-2]
- ADDW m6, [r4+4], m7
- paddw m5, [pw_16]
- psubw m5, m6 ; a-b
- psraw m5, 2 ; (a-b)/4
- psubw m5, m6 ; (a-b)/4-b
- movu m6, [r4+0]
- ADDW m6, [r4+2], m7
- paddw m5, m6 ; (a-b)/4-b+c
- psraw m5, 2 ; ((a-b)/4-b+c)/4
- paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- psraw m5, 1
- CLIPW m5, [pb_0], [pw_pixel_max]
- ;avg FILT_V, FILT_H
- pavgw m0, m5
- %if %0!=3
- movu m5, [r1+r5]
- %endif
- ret
- %endmacro
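-
- ; Same arithmetic as FILT_H, squeezed into m5-m7 so the vertical window
- ; in m1-m4 survives across the call; m5 is sacrificed and re-read from
- ; [r1+r5] (r5 = -stride) afterwards, except in the instances whose
- ; caller performs that reload itself (the optional third argument).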
-
- INIT_MMX mmxext
- RESET_MM_PERMUTATION
- %assign i 0
- %rep 3
- H_FILT_AVG 4, i
- SWAP 0,1,2,3,4,5
- %assign i i+1
- %endrep
- H_FILT_AVG 4, i, 0
-
- INIT_XMM sse2
- RESET_MM_PERMUTATION
- %assign i 0
- %rep 6
- %if i==1
- H_FILT_AVG 8, i, 0
- %else
- H_FILT_AVG 8, i
- %endif
- SWAP 0,1,2,3,4,5
- %assign i i+1
- %endrep
-
- %macro MC11 2
- ; this REALLY needs x86_64
- cglobal_mc %1, mc11, %2, 3,6,8
- mov r4, r1
- .body:
- PRELOAD_V
-
- sub r0, r2
- sub r4, r2
- mov r5, r2
- neg r5
- %assign j 0
- %rep %2
- %assign i (j % 6)
- call v_filt%2_ %+ i %+ _10
- call h_filt%2_ %+ i %+ _10
- %if %2==8 && i==1
- movu m5, [r1+r5]
- %endif
- OP_MOV [r0], m0
- SWAP 0,1,2,3,4,5
- %assign j j+1
- %endrep
- ret
- %endmacro
-
- MC MC11
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC31 2
- cglobal_mc %1, mc31, %2, 3,6,8
- mov r4, r1
- add r1, 2
- jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
- %endmacro
-
- MC MC31
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC13 2
- cglobal_mc %1, mc13, %2, 3,7,12
- lea r4, [r1+r2]
- jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
- %endmacro
-
- MC MC13
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC33 2
- cglobal_mc %1, mc33, %2, 3,6,8
- lea r4, [r1+r2]
- add r1, 2
- jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
- %endmacro
-
- MC MC33
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro FILT_H2 3
- psubw %1, %2 ; a-b
- psubw %2, %3 ; b-c
- psllw %2, 2
- psubw %1, %2 ; a-5*b+4*c
- psllw %3, 4
- paddw %1, %3 ; a-5*b+20*c
- %endmacro
-
- %macro FILT_VNRD 8
- movu %6, [r1]
- paddw %1, %6
- mova %7, %2
- paddw %7, %5
- mova %8, %3
- paddw %8, %4
- FILT_H2 %1, %7, %8
- %endmacro
-
- %macro HV 1
- %if mmsize==16
- %define PAD 12
- %define COUNT 2
- %else
- %define PAD 4
- %define COUNT 3
- %endif
- put_hv%1_10:
- neg r2 ; This actually saves instructions
- lea r1, [r1+r2*2-mmsize+PAD]
- lea r4, [rsp+PAD+gprsize]
- mov r3d, COUNT
- .v_loop:
- movu m0, [r1]
- sub r1, r2
- movu m1, [r1]
- sub r1, r2
- movu m2, [r1]
- sub r1, r2
- movu m3, [r1]
- sub r1, r2
- movu m4, [r1]
- sub r1, r2
- %assign i 0
- %rep %1-1
- FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
- psubw m0, [pad20]
- movu [r4+i*mmsize*3], m0
- sub r1, r2
- SWAP 0,1,2,3,4,5
- %assign i i+1
- %endrep
- FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
- psubw m0, [pad20]
- movu [r4+i*mmsize*3], m0
- add r4, mmsize
- lea r1, [r1+r2*8+mmsize]
- %if %1==8
- lea r1, [r1+r2*4]
- %endif
- dec r3d
- jg .v_loop
- neg r2
- ret
- %endmacro
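-
- ; First pass of the 2D (hv) positions: FILT_VNRD filters each column
- ; strip vertically and stores the unclipped 16-bit sums, biased by
- ; -pad20 (see the constants above), into the stack buffer with a row
- ; stride of mmsize*3 bytes so the second pass can walk whole rows.
- ; r2 is negated on entry so the row stepping and the end-of-strip
- ; rewind share addressing forms, and restored before ret.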
-
- INIT_MMX mmxext
- HV 4
- INIT_XMM sse2
- HV 8
-
- %macro H_LOOP 1
- %if num_mmregs > 8
- %define s1 m8
- %define s2 m9
- %define s3 m10
- %define d1 m11
- %else
- %define s1 [tap1]
- %define s2 [tap2]
- %define s3 [tap3]
- %define d1 [depad]
- %endif
- h%1_loop_op:
- movu m1, [r1+mmsize-4]
- movu m2, [r1+mmsize-2]
- mova m3, [r1+mmsize+0]
- movu m4, [r1+mmsize+2]
- movu m5, [r1+mmsize+4]
- movu m6, [r1+mmsize+6]
- %if num_mmregs > 8
- pmaddwd m1, s1
- pmaddwd m2, s1
- pmaddwd m3, s2
- pmaddwd m4, s2
- pmaddwd m5, s3
- pmaddwd m6, s3
- paddd m1, d1
- paddd m2, d1
- %else
- mova m0, s1
- pmaddwd m1, m0
- pmaddwd m2, m0
- mova m0, s2
- pmaddwd m3, m0
- pmaddwd m4, m0
- mova m0, s3
- pmaddwd m5, m0
- pmaddwd m6, m0
- mova m0, d1
- paddd m1, m0
- paddd m2, m0
- %endif
- paddd m3, m5
- paddd m4, m6
- paddd m1, m3
- paddd m2, m4
- psrad m1, 10
- psrad m2, 10
- pslld m2, 16
- pand m1, [pd_0f]
- por m1, m2
- %if num_mmregs <= 8
- pxor m0, m0
- %endif
- CLIPW m1, m0, m7
- add r1, mmsize*3
- ret
- %endmacro
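-
- ; Second pass over the biased intermediates: the six taps are folded
- ; into three pmaddwd constants on adjacent word pairs, tap1 = (1,-5),
- ; tap2 = (20,20), tap3 = (-5,1), accumulated in 32 bits. A scalar C
- ; sketch for one sample, v[] being the pad20-biased buffer values
- ; (illustrative only):
- ;   int32_t s = v[-2] - 5*v[-1] + 20*v[0] + 20*v[1] - 5*v[2] + v[3]
- ;             + 32*20*1023 + 512;    /* depad: un-bias + rounding */
- ;   int out  = av_clip(s >> 10, 0, 1023);
- ; Even and odd results land in two dword vectors, repacked into one
- ; word vector with pslld/pand/por (pd_0f masks the low words).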
-
- INIT_MMX mmxext
- H_LOOP 4
- INIT_XMM sse2
- H_LOOP 8
-
- %macro MC22 2
- cglobal_mc %1, mc22, %2, 3,7,12
- %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
- mov r6, rsp ; backup stack pointer
- and rsp, ~(mmsize-1) ; align stack
- sub rsp, PAD
-
- call put_hv%2_10
-
- mov r3d, %2
- mova m7, [pw_pixel_max]
- %if num_mmregs > 8
- pxor m0, m0
- mova m8, [tap1]
- mova m9, [tap2]
- mova m10, [tap3]
- mova m11, [depad]
- %endif
- mov r1, rsp
- .h_loop:
- call h%2_loop_op
-
- OP_MOV [r0], m1
- add r0, r2
- dec r3d
- jg .h_loop
-
- mov rsp, r6 ; restore stack pointer
- ret
- %endmacro
-
- MC MC22
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC12 2
- cglobal_mc %1, mc12, %2, 3,7,12
- %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
- mov r6, rsp ; backup stack pointer
- and rsp, ~(mmsize-1) ; align stack
- sub rsp, PAD
-
- call put_hv%2_10
-
- xor r4d, r4d
- .body:
- mov r3d, %2
- pxor m0, m0
- mova m7, [pw_pixel_max]
- %if num_mmregs > 8
- mova m8, [tap1]
- mova m9, [tap2]
- mova m10, [tap3]
- mova m11, [depad]
- %endif
- mov r1, rsp
- .h_loop:
- call h%2_loop_op
-
- movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
- paddw m3, [depad2]
- psrlw m3, 5
- psubw m3, [unpad]
- CLIPW m3, m0, m7
- pavgw m1, m3
-
- OP_MOV [r0], m1
- add r0, r2
- dec r3d
- jg .h_loop
-
- mov rsp, r6 ; restore stack pointer
- ret
- %endmacro
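-
- ; mc12/mc32 average the 2D result with the pure vertical half-pel
- ; sample, reconstructed from the same stack buffer instead of being
- ; refiltered: depad2/unpad turn a stored biased intermediate v back
- ; into clip((v + 16) >> 5), as derived with the constants above.
- ; r4 selects the column: 0 for mc12, one pixel (2 bytes) for mc32.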
-
- MC MC12
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC32 2
- cglobal_mc %1, mc32, %2, 3,7,12
- %define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
- mov r6, rsp ; backup stack pointer
- and rsp, ~(mmsize-1) ; align stack
- sub rsp, PAD
-
- call put_hv%2_10
-
- mov r4d, 2 ; sizeof(pixel)
- jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
- %endmacro
-
- MC MC32
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro H_NRD 1
- put_h%1_10:
- add rsp, gprsize
- mov r3d, %1
- xor r4d, r4d
- mova m6, [pad20]
- .nextrow:
- movu m2, [r5-4]
- movu m3, [r5-2]
- movu m4, [r5+0]
- ADDW m2, [r5+6], m5
- ADDW m3, [r5+4], m5
- ADDW m4, [r5+2], m5
-
- FILT_H2 m2, m3, m4
- psubw m2, m6
- mova [rsp+r4], m2
- add r4d, mmsize*3
- add r5, r2
- dec r3d
- jg .nextrow
- sub rsp, gprsize
- ret
- %endmacro
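-
- ; First-pass horizontal filter for mc21/mc23, stored with the same
- ; -pad20 bias so the shared mc12 .body can un-bias either buffer;
- ; mc21 passes r4d = PAD-mmsize to select this H buffer, which sits
- ; above the hv one on the stack. The add/sub rsp, gprsize pair steps
- ; over the return address so [rsp+r4] indexes the caller's buffer.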
-
- INIT_MMX mmxext
- H_NRD 4
- INIT_XMM sse2
- H_NRD 8
-
- %macro MC21 2
- cglobal_mc %1, mc21, %2, 3,7,12
- mov r5, r1
- .body:
- %define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
- mov r6, rsp ; backup stack pointer
- and rsp, ~(mmsize-1) ; align stack
-
- sub rsp, PAD
- call put_h%2_10
-
- sub rsp, PAD
- call put_hv%2_10
-
- mov r4d, PAD-mmsize ; H buffer
- jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
- %endmacro
-
- MC MC21
-
- ;-----------------------------------------------------------------------------
- ; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro MC23 2
- cglobal_mc %1, mc23, %2, 3,7,12
- lea r5, [r1+r2]
- jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
- %endmacro
-
- MC MC23