H.264: split luma dc idct out and implement MMX/SSE2 versions

About 2.5x the speed. NOTE: the way that the asm code handles large qmuls is a bit suboptimal. If x264-style dequant was used (separate shift and qmul values), it might be possible to get some extra speed. Originally committed as revision 26336 to svn://svn.ffmpeg.org/ffmpeg/trunk
15 years ago · 19fb234e4a
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -64,6 +64,10 @@ void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *bl
 void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
 void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);

 void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul);
 void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp);
 void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);

 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
                             const float *win, float add_bias, int len);
 void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -246,46 +246,6 @@ int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
    return 0;
 }

 /**
 * IDCT transforms the 16 dc values and dequantizes them.
 * @param qp quantization parameter
 */
 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
 #define stride 16
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

 //memset(block, 64, 2*256);
 //return;
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
        block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
        block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
        block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
    }
 }

 #if 0
 /**
 * DCT transforms the 16 dc values.
@@ -1245,9 +1205,15 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
                if(is_h264){
                    if(!transform_bypass)
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
                        h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]);
                    else{
                        static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
                                                                8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
                        for(i = 0; i < 16; i++)
                            h->mb[dc_mapping[i]] = h->mb_luma_dc[i];
                    }
                }else
                    ff_svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
                    ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale);
            }
            if(h->deblocking_filter)
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -406,6 +406,7 @@ typedef struct H264Context{
    GetBitContext *inter_gb_ptr;

    DECLARE_ALIGNED(16, DCTELEM, mb)[16*24];
    DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16];
    DCTELEM mb_padding[256];        ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb

    /**
@@ -600,10 +601,6 @@ typedef struct H264Context{

 extern const uint8_t ff_h264_chroma_qp[52];

 void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);

 void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);

 /**
 * Decode SEI
 */
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -1597,17 +1597,15 @@ decode_intra_mb:
    s->current_picture.mb_type[mb_xy]= mb_type;

    if( cbp || IS_INTRA16x16( mb_type ) ) {
        const uint8_t *scan, *scan8x8, *dc_scan;
        const uint8_t *scan, *scan8x8;
        const uint32_t *qmul;

        if(IS_INTERLACED(mb_type)){
            scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
            scan= s->qscale ? h->field_scan : h->field_scan_q0;
            dc_scan= luma_dc_field_scan;
        }else{
            scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
            scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
            dc_scan= luma_dc_zigzag_scan;
        }

        // decode_cabac_mb_dqp
@@ -1642,7 +1640,9 @@ decode_intra_mb:
        if( IS_INTRA16x16( mb_type ) ) {
            int i;
            //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
            decode_cabac_residual_dc( h, h->mb, 0, 0, dc_scan, 16);
            AV_ZERO128(h->mb_luma_dc+0);
            AV_ZERO128(h->mb_luma_dc+8);
            decode_cabac_residual_dc( h, h->mb_luma_dc, 0, 0, scan, 16);

            if( cbp&15 ) {
                qmul = h->dequant4_coeff[0][s->qscale];
--- a/libavcodec/h264_cavlc.c
+++ b/libavcodec/h264_cavlc.c
@@ -911,16 +911,14 @@ decode_intra_mb:
        int i8x8, i4x4, chroma_idx;
        int dquant;
        GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
        const uint8_t *scan, *scan8x8, *dc_scan;
        const uint8_t *scan, *scan8x8;

        if(IS_INTERLACED(mb_type)){
            scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
            scan= s->qscale ? h->field_scan : h->field_scan_q0;
            dc_scan= luma_dc_field_scan;
        }else{
            scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
            scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
            dc_scan= luma_dc_zigzag_scan;
        }

        dquant= get_se_golomb(&s->gb);
@@ -939,7 +937,9 @@ decode_intra_mb:
        h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
        h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
        if(IS_INTRA16x16(mb_type)){
            if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
            AV_ZERO128(h->mb_luma_dc+0);
            AV_ZERO128(h->mb_luma_dc+8);
            if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc, LUMA_DC_BLOCK_INDEX, scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
                return -1; //FIXME continue if partitioned and other return -1 too
            }

--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@@ -282,6 +282,7 @@ void ff_h264dsp_init(H264DSPContext *c)
    c->h264_idct8_add4     = ff_h264_idct8_add4_c;
    c->h264_idct_add8      = ff_h264_idct_add8_c;
    c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
    c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_c;

    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -65,11 +65,13 @@ typedef struct H264DSPContext{
    void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
    void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
    void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);

    void (*h264_dct)(DCTELEM block[4][4]);
    void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
    void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
    void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
    void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
    void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul);
 }H264DSPContext;

 void ff_h264dsp_init(H264DSPContext *c);
--- a/libavcodec/h264idct.c
+++ b/libavcodec/h264idct.c
@@ -216,3 +216,38 @@ void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block
            ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
    }
 }
 /**
 * IDCT transforms the 16 dc values and dequantizes them.
 * @param qp quantization parameter
 */
 void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul){
 #define stride 16
    int i;
    int temp[16];
    static const uint8_t x_offset[4]={0, 2*stride, 8*stride, 10*stride};

    for(i=0; i<4; i++){
        const int z0= input[4*i+0] + input[4*i+1];
        const int z1= input[4*i+0] - input[4*i+1];
        const int z2= input[4*i+2] - input[4*i+3];
        const int z3= input[4*i+2] + input[4*i+3];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z0-z3;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z1+z2;
    }

    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        output[stride* 0+offset]= ((((z0 + z3)*qmul + 128 ) >> 8));
        output[stride* 1+offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
        output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
        output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
    }
 }
--- a/libavcodec/svq3.c
+++ b/libavcodec/svq3.c
@@ -126,21 +126,19 @@ static const uint32_t svq3_dequant_coeff[32] = {
 };


 void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp)
 void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp)
 {
    const int qmul = svq3_dequant_coeff[qp];
 #define stride 16
    int i;
    int temp[16];
    static const int x_offset[4] = {0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4] = {0, 2*stride, 8* stride, 10*stride};

    for (i = 0; i < 4; i++){
        const int offset = y_offset[i];
        const int z0 = 13*(block[offset+stride*0] +    block[offset+stride*4]);
        const int z1 = 13*(block[offset+stride*0] -    block[offset+stride*4]);
        const int z2 =  7* block[offset+stride*1] - 17*block[offset+stride*5];
        const int z3 = 17* block[offset+stride*1] +  7*block[offset+stride*5];
        const int z0= 13*(input[4*i+0] +    input[4*i+1]);
        const int z1= 13*(input[4*i+0] -    input[4*i+1]);
        const int z2=  7* input[4*i+2] - 17*input[4*i+3];
        const int z3= 17* input[4*i+2] +  7*input[4*i+3];

        temp[4*i+0] = z0+z3;
        temp[4*i+1] = z1+z2;
@@ -155,10 +153,10 @@ void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp)
        const int z2 =  7* temp[4*1+i] - 17*temp[4*3+i];
        const int z3 = 17* temp[4*1+i] +  7*temp[4*3+i];

        block[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20;
        block[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20;
        block[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20;
        block[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20;
        output[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20;
        output[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20;
        output[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20;
        output[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20;
    }
 }
 #undef stride
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -41,6 +41,7 @@ DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
 {0x8000000080000000ULL, 0x8000000080000000ULL};

 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_1  ) = 0x0001000100010001ULL;
 DECLARE_ALIGNED(8,  const xmm_reg,  ff_pw_3  ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4  ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -47,6 +47,7 @@ scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
 %endif

 cextern pw_32
 cextern pw_1

 SECTION .text

@@ -854,3 +855,156 @@ cglobal h264_idct_add8_sse2, 5, 7, 8
    add8_sse2_cycle 2, 0x21
    add8_sse2_cycle 3, 0x29
    RET

 ;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)

 %macro WALSH4_1D 5
    SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
    SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
    SWAP %1, %4, %3
 %endmacro

 %macro DEQUANT_MMX 3
    mova        m7, [pw_1]
    mova        m4, %1
    punpcklwd   %1, m7
    punpckhwd   m4, m7
    mova        m5, %2
    punpcklwd   %2, m7
    punpckhwd   m5, m7
    movd        m7, t3d
    punpckldq   m7, m7
    pmaddwd     %1, m7
    pmaddwd     %2, m7
    pmaddwd     m4, m7
    pmaddwd     m5, m7
    psrad       %1, %3
    psrad       %2, %3
    psrad       m4, %3
    psrad       m5, %3
    packssdw    %1, m4
    packssdw    %2, m5
 %endmacro

 %macro STORE_WORDS_MMX 5
    movd  t0d, %1
    psrlq  %1, 32
    movd  t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
 %endmacro

 %macro DEQUANT_STORE_MMX 1
    DEQUANT_MMX m0, m1, %1
    STORE_WORDS_MMX m0,  0,  1,  4,  5
    STORE_WORDS_MMX m1,  2,  3,  6,  7

    DEQUANT_MMX m2, m3, %1
    STORE_WORDS_MMX m2,  8,  9, 12, 13
    STORE_WORDS_MMX m3, 10, 11, 14, 15
 %endmacro

 %macro STORE_WORDS_SSE 9
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    psrldq  %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
 %endmacro

 %macro DEQUANT_STORE_SSE2 1
    movd      xmm4, t3d
    movq      xmm5, [pw_1]
    pshufd    xmm4, xmm4, 0
    movq2dq   xmm0, m0
    movq2dq   xmm1, m1
    movq2dq   xmm2, m2
    movq2dq   xmm3, m3
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd   xmm0, xmm4
    pmaddwd   xmm1, xmm4
    pmaddwd   xmm2, xmm4
    pmaddwd   xmm3, xmm4
    psrad     xmm0, %1
    psrad     xmm1, %1
    psrad     xmm2, %1
    psrad     xmm3, %1
    packssdw  xmm0, xmm1
    packssdw  xmm2, xmm3
    STORE_WORDS_SSE xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS_SSE xmm2,  8,  9, 12, 13, 10, 11, 14, 15
 %endmacro

 %macro IDCT_DC_DEQUANT 2
 cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
    movq        m3, [r1+24]
    movq        m2, [r1+16]
    movq        m1, [r1+ 8]
    movq        m0, [r1+ 0]
    WALSH4_1D    0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D    0,1,2,3,4

 ; shift, tmp, output, qmul
 %ifdef WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg        r0, t2
 %elifdef ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
 %else
    DECLARE_REG_TMP 1,3,0,2
 %endif

    cmp        t3d, 32767
    jg .big_qmul
    add        t3d, 128 << 16
 %ifidn %1,mmx
    DEQUANT_STORE_MMX 8
 %else
    DEQUANT_STORE_SSE2 8
 %endif
    RET
 .big_qmul:
    bsr        t0d, t3d
    add        t3d, 128 << 16
    mov        t1d, 7
    cmp        t0d, t1d
    cmovg      t0d, t1d
    inc        t1d
    shr        t3d, t0b
    sub        t1d, t0d
 %ifidn %1,mmx
    movd        m6, t1d
    DEQUANT_STORE_MMX m6
 %else
    movd      xmm6, t1d
    DEQUANT_STORE_SSE2 xmm6
 %endif
    RET
 %endmacro

 INIT_MMX
 IDCT_DC_DEQUANT mmx, 0
 IDCT_DC_DEQUANT sse2, 7
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -59,6 +59,8 @@ void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM
                                  int stride, const uint8_t nnzc[6*8]);
 void ff_h264_idct_add8_sse2      (uint8_t **dest, const int *block_offset, DCTELEM *block,
                                  int stride, const uint8_t nnzc[6*8]);
 void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
 void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);

 /***********************************/
 /* deblocking */
@@ -301,6 +303,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
        c->h264_idct8_add4     = ff_h264_idct8_add4_mmx;
        c->h264_idct_add8      = ff_h264_idct_add8_mmx;
        c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
        c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx;

        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
@@ -341,6 +344,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
            if (mm_flags&AV_CPU_FLAG_SSE2) {
                c->h264_idct8_add = ff_h264_idct8_add_sse2;
                c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
                c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;

                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;