|
|
|
@@ -2752,91 +2752,69 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int o
 }
 
 #ifdef CONFIG_ENCODERS
 
-static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
-    long i=0;
-
-    assert(FFABS(scale) < 256);
-    scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
 
+#define PHADDD(a, t)\
+    "movq "#a", "#t"                  \n\t"\
+    "psrlq $32, "#a"                  \n\t"\
+    "paddd "#t", "#a"                 \n\t"
+/*
+   pmulhw:   dst[0-15]=(src[0-15]*dst[0-15])[16-31]
+   pmulhrw:  dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
+   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
+ */
+#define PMULHRW(x, y, s, o)\
+    "pmulhw " #s ", "#x "             \n\t"\
+    "pmulhw " #s ", "#y "             \n\t"\
+    "paddw " #o ", "#x "              \n\t"\
+    "paddw " #o ", "#y "              \n\t"\
+    "psraw $1, "#x "                  \n\t"\
+    "psraw $1, "#y "                  \n\t"
+#define DEF(x) x ## _mmx
+#define SET_RND MOVQ_WONE
+#define SCALE_OFFSET 1
+
+#include "dsputil_mmx_qns.h"
+
-    asm volatile(
-        "pcmpeqw %%mm6, %%mm6           \n\t" // -1w
-        "psrlw $15, %%mm6               \n\t" //  1w
-        "pxor %%mm7, %%mm7              \n\t"
-        "movd  %4, %%mm5                \n\t"
-        "punpcklwd %%mm5, %%mm5         \n\t"
-        "punpcklwd %%mm5, %%mm5         \n\t"
-        "1:                             \n\t"
-        "movq  (%1, %0), %%mm0          \n\t"
-        "movq  8(%1, %0), %%mm1         \n\t"
-        "pmulhw %%mm5, %%mm0            \n\t"
-        "pmulhw %%mm5, %%mm1            \n\t"
-        "paddw %%mm6, %%mm0             \n\t"
-        "paddw %%mm6, %%mm1             \n\t"
-        "psraw $1, %%mm0                \n\t"
-        "psraw $1, %%mm1                \n\t"
-        "paddw (%2, %0), %%mm0          \n\t"
-        "paddw 8(%2, %0), %%mm1         \n\t"
-        "psraw $6, %%mm0                \n\t"
-        "psraw $6, %%mm1                \n\t"
-        "pmullw (%3, %0), %%mm0         \n\t"
-        "pmullw 8(%3, %0), %%mm1        \n\t"
-        "pmaddwd %%mm0, %%mm0           \n\t"
-        "pmaddwd %%mm1, %%mm1           \n\t"
-        "paddd %%mm1, %%mm0             \n\t"
-        "psrld $4, %%mm0                \n\t"
-        "paddd %%mm0, %%mm7             \n\t"
-        "add $16, %0                    \n\t"
-        "cmp $128, %0                   \n\t" //FIXME optimize & bench
-        " jb 1b                         \n\t"
-        "movq %%mm7, %%mm6              \n\t"
-        "psrlq $32, %%mm7               \n\t"
-        "paddd %%mm6, %%mm7             \n\t"
-        "psrld $2, %%mm7                \n\t"
-        "movd %%mm7, %0                 \n\t"
 
+#undef DEF
+#undef SET_RND
+#undef SCALE_OFFSET
+#undef PMULHRW
 
: "+r" (i) |
|
|
|
: "r"(basis), "r"(rem), "r"(weight), "g"(scale) |
|
|
|
); |
|
|
|
return i; |
|
|
|
} |
|
|
|
+#define DEF(x) x ## _3dnow
+#define SET_RND(x)
+#define SCALE_OFFSET 0
+#define PMULHRW(x, y, s, o)\
+    "pmulhrw " #s ", "#x "            \n\t"\
+    "pmulhrw " #s ", "#y "            \n\t"
 
-static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
-    long i=0;
 
#include "dsputil_mmx_qns.h" |
|
|
|
|
|
|
|
#undef DEF |
|
|
|
#undef SET_RND |
|
|
|
#undef SCALE_OFFSET |
|
|
|
#undef PMULHRW |
|
|
|
|
|
|
|
+#ifdef HAVE_SSSE3
+#undef PHADDD
+#define DEF(x) x ## _ssse3
+#define SET_RND(x)
+#define SCALE_OFFSET -1
+#define PHADDD(a, t)\
+    "pshufw $0x0E, "#a", "#t"         \n\t"\
+    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
+#define PMULHRW(x, y, s, o)\
+    "pmulhrsw " #s ", "#x "           \n\t"\
+    "pmulhrsw " #s ", "#y "           \n\t"
+
+#include "dsputil_mmx_qns.h"
+
+#undef DEF
+#undef SET_RND
+#undef SCALE_OFFSET
+#undef PMULHRW
+#undef PHADDD
+#endif //HAVE_SSSE3
 
-    if(FFABS(scale) < 256){
-        scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
-        asm volatile(
-            "pcmpeqw %%mm6, %%mm6       \n\t" // -1w
-            "psrlw $15, %%mm6           \n\t" //  1w
-            "movd  %3, %%mm5            \n\t"
-            "punpcklwd %%mm5, %%mm5     \n\t"
-            "punpcklwd %%mm5, %%mm5     \n\t"
-            "1:                         \n\t"
-            "movq  (%1, %0), %%mm0      \n\t"
-            "movq  8(%1, %0), %%mm1     \n\t"
-            "pmulhw %%mm5, %%mm0        \n\t"
-            "pmulhw %%mm5, %%mm1        \n\t"
-            "paddw %%mm6, %%mm0         \n\t"
-            "paddw %%mm6, %%mm1         \n\t"
-            "psraw $1, %%mm0            \n\t"
-            "psraw $1, %%mm1            \n\t"
-            "paddw (%2, %0), %%mm0      \n\t"
-            "paddw 8(%2, %0), %%mm1     \n\t"
-            "movq %%mm0, (%2, %0)       \n\t"
-            "movq %%mm1, 8(%2, %0)      \n\t"
-            "add $16, %0                \n\t"
-            "cmp $128, %0               \n\t" //FIXME optimize & bench
-            " jb 1b                     \n\t"
-
-            : "+r" (i)
-            : "r"(basis), "r"(rem), "g"(scale)
-        );
-    }else{
-        for(i=0; i<8*8; i++){
-            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
-        }
-    }
-}
 
 #endif /* CONFIG_ENCODERS */
 
 #define PREFETCH(name, op) \
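
As a reading aid for the PMULHRW/SCALE_OFFSET template parameters in the hunk above, here is a minimal scalar sketch (not part of the patch; the qns_mulh_* helper names are made up, and an arithmetic right shift on signed int is assumed) of the per-coefficient rounding each instruction-set variant performs:

#include <stdint.h>

/* MMX build: pmulhw keeps bits [16..31], then paddw with 1 (MOVQ_WONE)
 * and psraw $1 round; the extra >>1 is why this build uses SCALE_OFFSET 1. */
static inline int16_t qns_mulh_mmx(int16_t a, int16_t b){
    return (int16_t)((((a * b) >> 16) + 1) >> 1);
}

/* 3DNow build: pmulhrw rounds internally with +0x8000, so SCALE_OFFSET 0. */
static inline int16_t qns_mulh_3dnow(int16_t a, int16_t b){
    return (int16_t)((a * b + 0x8000) >> 16);
}

/* SSSE3 build: pmulhrsw keeps bits [15..30], i.e. one bit more precision,
 * hence SCALE_OFFSET -1. */
static inline int16_t qns_mulh_ssse3(int16_t a, int16_t b){
    return (int16_t)((a * b + 0x4000) >> 15);
}

All three approximate the C fallback, rem[i] += (basis[i]*scale) >> (BASIS_SHIFT - RECON_SHIFT), provided scale is first pre-shifted by 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT (matching the removed scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT line for the MMX case); compensating for the differing shift counts appears to be the purpose of the per-variant SCALE_OFFSET values consumed by dsputil_mmx_qns.h, which is not shown in this hunk.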
|
|
|
@@ -3625,6 +3603,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 #ifdef HAVE_SSSE3
     if(mm_flags & MM_SSSE3){
+        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+            c->try_8x8basis= try_8x8basis_ssse3;
+        }
+        c->add_8x8basis= add_8x8basis_ssse3;
         c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
         c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
         c->hadamard8_diff[1]= hadamard8_diff_ssse3;
|
|
|
@@ -3646,6 +3628,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 #endif
 
     if(mm_flags & MM_3DNOW){
+#ifdef CONFIG_ENCODERS
+        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+            c->try_8x8basis= try_8x8basis_3dnow;
+        }
+        c->add_8x8basis= add_8x8basis_3dnow;
+#endif //CONFIG_ENCODERS
         c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
         c->vector_fmul = vector_fmul_3dnow;
         if(!(avctx->flags & CODEC_FLAG_BITEXACT))
|
|
|
|