x86/vc1dsp: Port vc1_*_hor_16b_shift2 to NASM format

Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com>
9 years ago · bcc223523e
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp.asm
@@ -25,6 +25,7 @@
 cextern pw_4
 cextern pw_5
 cextern pw_9
 cextern pw_128

 section .text

@@ -319,6 +320,44 @@ cglobal vc1_h_loop_filter8, 3,5,8
    RET

 %if HAVE_MMX_INLINE

 ; XXX some of these macros are not used right now, but they will be in the
 ;     future when more functions are ported.

 ; OP_PUT: "put" flavour of the op parameter taken by the TRANSFER_* macros.
 ; It expands to nothing, so the value in the register is stored unchanged.
 %macro OP_PUT 2 ; dst, src
 %endmacro

 ; OP_AVG: "avg" flavour of the op parameter taken by the TRANSFER_* macros.
 ; Replaces dst (%1) with the per-byte unsigned average of dst and src (%2).
 %macro OP_AVG 2 ; dst, src
    pavgb           %1, %2
 %endmacro

 ; Round and scale the two 16-bit accumulators m3 and m4: the rounder held in
 ; m7 is added to each, then each is arithmetically shifted right by %1.
 ; m7 must have been set up by the caller beforehand.
 %macro NORMALIZE_MMX 1 ; shift
    ; first accumulator
    paddw           m3, m7 ; +bias-r
    psraw           m3, %1
    ; second accumulator
    paddw           m4, m7 ; +bias-r
    psraw           m4, %1
 %endmacro

 ; Pack the 16-bit results in m3/m4 down to unsigned-saturated bytes in m3,
 ; combine them with the destination via op, and store one register to [dst].
 ;   %1 = op   two-operand macro invoked as "op m3, [dst]" (OP_PUT / OP_AVG)
 ;   %2 = dst  address expression of the destination
 ; Clobbers m3.
 %macro TRANSFER_DO_PACK 2 ; op, dst
    packuswb        m3, m4
    %1              m3, [%2]
    mova          [%2], m3
 %endmacro

 ; Combine the two unpacked result registers m3/m4 with the destination via
 ; op and store them, without packing, to [dst] and [dst + mmsize].
 ;   %1 = op   two-operand macro invoked as "op reg, mem" (OP_PUT / OP_AVG)
 ;   %2 = dst  address expression of the first mmsize-byte half
 ; Each register is combined with the memory half it is subsequently stored
 ; to: m3 with [dst], m4 with [dst + mmsize].
 %macro TRANSFER_DONT_PACK 2 ; op, dst
    %1              m3, [%2]
    %1              m4, [%2 + mmsize]
    mova          [%2], m3
    mova [%2 + mmsize], m4
 %endmacro

 ; see MSPEL_FILTER13_CORE for use as UNPACK macro
 ; Interleave the low bytes of reg with the bytes of m0, in place.
 ; With m0 == 0 this zero-extends the low bytes of reg to 16-bit words;
 ; zeroing m0 is left to the caller.
 %macro DO_UNPACK 1 ; reg
    punpcklbw       %1, m0
 %endmacro
 ; No-op counterpart of DO_UNPACK with the same signature: expands to nothing
 ; and leaves reg untouched.
 %macro DONT_UNPACK 1 ; reg
 %endmacro

 ; Compute the rounder 32-r or 8-r and unpack it into m7
 %macro LOAD_ROUNDER_MMX 1 ; round
    movd      m7, %1
@@ -394,6 +433,57 @@ cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
    dec                 i
        jnz         .loop
    REP_RET
 %undef rnd
 %undef shift
 %undef stride_neg2
 %undef stride_9minus4
 %undef i

 ; void ff_vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
 ;                                    const int16_t *src, int rnd);
 ; void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride,
 ;                                       const int16_t *src, int rnd);
 ;
 ; Horizontal 4-tap (-1, 9, 9, -1) filter over 16-bit intermediate data.
 ; For each of 8 rows, two registers' worth of 16-bit samples are filtered,
 ; rounded, shifted right by 7, packed to unsigned bytes and written to dst
 ; through the op macro.
 ; Data is already unpacked, so some operations can directly be made from
 ; memory.
 ;
 ; Macro parameters:
 ;   %1 = op      OP_PUT or OP_AVG, forwarded to TRANSFER_DO_PACK
 ;   %2 = opname  inserted into the function name (vc1_<opname>_hor_16b_shift2)
 ; Per iteration src advances by 24 bytes and dst by stride.
 %macro HOR_16B_SHIFT2 2 ; op, opname
 cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
    ; h = row counter (8 rows)
    mov                hq, 8
    ; step back one 16-bit sample so the four taps read src[-1] .. src[2]
    sub              srcq, 2
    ; The taps sum to -1+9+9-1 = 16, so taking 16*1024 off the rounder is the
    ; same as biasing every input sample by -1024. After the >> 7 below this
    ; leaves a -128 offset, which is undone by adding pw_128.
    sub              rndd, (-1+9+9-1) * 1024 ; add -1024 bias
    ; rounder goes to m7 (consumed by NORMALIZE_MMX)
    LOAD_ROUNDER_MMX rndq
    ; m5 = multiplier for the two centre taps, m6 = bias-removal constant
    ; (pw_9 / pw_128 are external constants; presumably packed words of 9 and
    ; 128 - confirm against their definition)
    mova               m5, [pw_9]
    mova               m6, [pw_128]
    ; NOTE(review): m0 is zeroed here but not referenced by the macros visibly
    ; used in this function - confirm whether this is still required.
    pxor               m0, m0

 .loop:
    ; m1/m2 collect the outer taps, m3/m4 the centre taps (low/high half)
    mova               m1, [srcq + 2 * 0]
    mova               m2, [srcq + 2 * 0 + mmsize]
    mova               m3, [srcq + 2 * 1]
    mova               m4, [srcq + 2 * 1 + mmsize]
    paddw              m3, [srcq + 2 * 2]
    paddw              m4, [srcq + 2 * 2 + mmsize]
    paddw              m1, [srcq + 2 * 3]
    paddw              m2, [srcq + 2 * 3 + mmsize]
    ; m3/m4 = 9 * (centre sum) - (outer sum)
    pmullw             m3, m5
    pmullw             m4, m5
    psubw              m3, m1
    psubw              m4, m2
    ; add rounder (m7) and shift right by 7
    NORMALIZE_MMX      7
    ; remove bias
    paddw              m3, m6
    paddw              m4, m6
    ; pack to bytes, apply op against the destination, store to [dstq]
    TRANSFER_DO_PACK   %1, dstq
    ; next row: 24 bytes (12 int16) of source, one stride of destination
    add              srcq, 24
    add              dstq, strideq
    dec                hq
        jnz         .loop

    RET
 %endmacro

 ; Instantiate the "put" variant for plain MMX.
 INIT_MMX mmx
 HOR_16B_SHIFT2 OP_PUT, put

 ; Instantiate the "avg" variant for MMXEXT: OP_AVG uses pavgb, which is not
 ; part of the base MMX instruction set.
 INIT_MMX mmxext
 HOR_16B_SHIFT2 OP_AVG, avg
 %endif ; HAVE_MMX_INLINE

 %macro INV_TRANS_INIT 0
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -38,6 +38,10 @@
 /* Prototypes of routines implemented in external assembly (vc1dsp.asm). */

 /* Vertical shift2 pass: reads 8-bit src, writes 16-bit intermediate dst. */
 void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                   const uint8_t *src, x86_reg stride,
                                   int rnd, int64_t shift);
 /* Horizontal shift2 pass, "put" variant: reads 16-bit intermediate src,
  * writes 8-bit dst. */
 void ff_vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
                                   const int16_t *src, int rnd);
 /* Horizontal shift2 pass, "avg" variant: as above, but the result is
  * averaged with the existing contents of dst. */
 void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride,
                                       const int16_t *src, int rnd);

 /* Per-variant operation, emitted as inline-asm text (AT&T operand order):
  * OP_PUT expands to nothing, OP_AVG emits "pavgb S, D". */
 #define OP_PUT(S,D)
 #define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
@@ -70,55 +74,6 @@ void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
     "punpcklwd %%mm7, %%mm7           \n\t"    \
     "punpckldq %%mm7, %%mm7           \n\t"

 /**
 * Generate OPNAME##vc1_hor_16b_shift2_mmx(): a horizontal 4-tap
 * (-1, 9, 9, -1) filter over 16-bit intermediate data, run for 8 rows.
 * Per row, 9 * (s[1] + s[2]) - (s[0] + s[3]) is computed relative to
 * src - 1, normalized with a shift of 7, and handed to TRANSFER_DO_PACK(OP).
 * src advances by 24 bytes and dst by stride per row.
 *
 * rnd is lowered by 16 * 1024 (the taps sum to 16), i.e. a -1024 bias per
 * input sample; the resulting -128 after the shift is removed by adding
 * ff_pw_128.
 *
 * Data is already unpacked, so some operations can directly be made from
 * memory.
 *
 * @param OP      OP_PUT or OP_AVG
 * @param OPNAME  prefix of the generated function name (put_ or avg_)
 */
 #define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
 static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
                                             const int16_t *src, int rnd)\
 {\
    int h = 8;\
 \
    src -= 1;\
    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
    __asm__ volatile(\
        LOAD_ROUNDER_MMX("%4")\
        "movq      "MANGLE(ff_pw_128)", %%mm6\n\t"\
        "movq      "MANGLE(ff_pw_9)", %%mm5 \n\t"\
        "1:                                \n\t"\
        "movq      2*0+0(%1), %%mm1        \n\t"\
        "movq      2*0+8(%1), %%mm2        \n\t"\
        "movq      2*1+0(%1), %%mm3        \n\t"\
        "movq      2*1+8(%1), %%mm4        \n\t"\
        "paddw     2*3+0(%1), %%mm1        \n\t"\
        "paddw     2*3+8(%1), %%mm2        \n\t"\
        "paddw     2*2+0(%1), %%mm3        \n\t"\
        "paddw     2*2+8(%1), %%mm4        \n\t"\
        "pmullw    %%mm5, %%mm3            \n\t"\
        "pmullw    %%mm5, %%mm4            \n\t"\
        "psubw     %%mm1, %%mm3            \n\t"\
        "psubw     %%mm2, %%mm4            \n\t"\
        NORMALIZE_MMX("$7")\
        /* Remove bias */\
        "paddw     %%mm6, %%mm3            \n\t"\
        "paddw     %%mm6, %%mm4            \n\t"\
        TRANSFER_DO_PACK(OP)\
        "add       $24, %1                 \n\t"\
        "add       %3, %2                  \n\t"\
        "decl      %0                      \n\t"\
        "jnz 1b                            \n\t"\
        : "+r"(h), "+r" (src),  "+r" (dst)\
        : "r"(stride), "m"(rnd)\
          NAMED_CONSTRAINTS_ADD(ff_pw_128,ff_pw_9)\
        : "memory"\
    );\
 }

 /* Instantiate put_vc1_hor_16b_shift2_mmx() and avg_vc1_hor_16b_shift2_mmx(). */
 VC1_HOR_16b_SHIFT2(OP_PUT, put_)
 VC1_HOR_16b_SHIFT2(OP_AVG, avg_)


 /**
 * Purely vertical or horizontal 1/2 shift interpolation.
 * Sacrifice mm6 for the *9 factor.
@@ -380,14 +335,14 @@ typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_
 * @param  hmode   Vertical filter.
 * @param  rnd     Rounding bias.
 */
 #define VC1_MSPEL_MC(OP)\
 #define VC1_MSPEL_MC(OP, INSTR)\
 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                               int hmode, int vmode, int rnd)\
 {\
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
         { NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
         { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
         { NULL, OP ## vc1_hor_16b_shift1_mmx, ff_vc1_ ## OP ## hor_16b_shift2_ ## INSTR, OP ## vc1_hor_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
         { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
 \
@@ -428,8 +383,8 @@ static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
 }

 /* Instantiate the put_ and avg_ flavours of vc1_mspel_mc. In the
  * two-argument form, INSTR is pasted as the suffix of the
  * ff_vc1_<OP>hor_16b_shift2_<INSTR> routine placed in the horizontal
  * shift table (mmx for put_, mmxext for avg_). */
 VC1_MSPEL_MC(put_)
 VC1_MSPEL_MC(avg_)
 VC1_MSPEL_MC(put_, mmx)
 VC1_MSPEL_MC(avg_, mmxext)

 /** Macro to ease bicubic filter interpolation functions declarations */
 #define DECLARE_FUNCTION(a, b)                                          \