ARM: NEON optimised H.264 weighted prediction

Originally committed as revision 16771 to svn://svn.ffmpeg.org/ffmpeg/trunk
17 years ago · bd53b426b7
--- a/libavcodec/arm/dsputil_neon.c
+++ b/libavcodec/arm/dsputil_neon.c
@@ -92,6 +92,23 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
 void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
                                       int beta, int8_t *tc0);

 /*
  * H.264 unidirectional weighted prediction, one entry point per WxH block
  * size.  The bodies are written in NEON assembly (h264dsp_neon.S).
  *
  * As implemented there, the block at ds is read and overwritten in place,
  * consecutive rows are stride bytes apart, and each pixel becomes
  *   clip_uint8((pixel * weight + bias) >> log2_den)
  * where bias is (offset << log2_den) plus a rounding term of
  * 1 << (log2_den - 1) when log2_den is at least 1.
  */
 void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
                                       int weight, int offset);
 void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
                                      int weight, int offset);
 void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
                                      int weight, int offset);
 void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
                                     int weight, int offset);
 void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
                                     int weight, int offset);
 void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
                                     int weight, int offset);
 void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
                                     int weight, int offset);
 void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
                                     int weight, int offset);

 void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
                                        int log2_den, int weightd, int weights,
                                        int offset);
@@ -201,6 +218,15 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
    c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
    c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;

    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
    c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
    c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
    c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
    c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
    c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;

    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -1536,3 +1536,135 @@ function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        biweight_entry  4,  2
        biweight_entry  4,  4,  b=0
        biweight_func   4

@ Weighted prediction

        @ weight_16: row loop and epilogue for 16-pixel-wide blocks.
        @ Two rows are processed per pass.  The mac argument is the widening
        @ multiply instruction to apply; weight_func passes vmlal.u8 for a
        @ non-negative weight and vmlsl.u8 for a negated negative weight.
        @
        @ Expected on entry (set up by weight_entry and weight_func):
        @   r0 = read pointer          r4 = write pointer (same block)
        @   r1 = row stride in bytes   r3 = weight magnitude
        @   ip = rows remaining        q8 = bias in every 16-bit lane
        @   q9 = negated log2_den in every 16-bit lane
        @ The macro ends with pop {r4, pc}, i.e. it returns to the caller.
        @
        @ NOTE(review): the accumulators are 16-bit lanes; confirm that
        @ weight * pixel plus the bias cannot overflow for valid inputs.
        .macro  weight_16 mac
        vdup.8          d0,  r3                 @ low byte of r3 in all 8 lanes
        vmov            q2,  q8                 @ seed row 0 accumulators with the bias
        vmov            q3,  q8
 1:      subs            ip,  ip,  #2            @ two rows per pass; flags used by bne below
        vld1.8          {d20-d21},[r0,:128], r1 @ row 0: 16 pixels, then r0 += stride
        \mac            q2,  d0,  d20           @ row 0, pixels 0-7
        pld             [r0]                    @ preload hint for the next row
        \mac            q3,  d0,  d21           @ row 0, pixels 8-15
        vmov            q12, q8                 @ seed row 1 accumulators with the bias
        vld1.8          {d28-d29},[r0,:128], r1 @ row 1: 16 pixels, then r0 += stride
        vmov            q13, q8
        \mac            q12, d0,  d28           @ row 1, pixels 0-7
        pld             [r0]
        \mac            q13, d0,  d29           @ row 1, pixels 8-15
        vshl.s16        q2,  q2,  q9            @ negative count in q9: shift right
        vshl.s16        q3,  q3,  q9
        vqmovun.s16     d4,  q2                 @ saturate signed 16-bit to unsigned 8-bit
        vqmovun.s16     d5,  q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3,  q8                 @ re-seed q3 for the next pass (d4-d5 still hold row 0)
        vst1.8          {d4- d5}, [r4,:128], r1 @ store row 0, then r4 += stride
        vmov            q2,  q8                 @ re-seed q2 only after row 0 has been stored
        vst1.8          {d24-d25},[r4,:128], r1 @ store row 1, then r4 += stride
        bne             1b                      @ loop until ip reaches zero
        pop             {r4, pc}                @ matches push {r4, lr} in weight_func
        .endm

        @ weight_8: row loop and epilogue for 8-pixel-wide blocks.
        @ Two rows are processed per pass.  The mac argument is the widening
        @ multiply instruction to apply (vmlal.u8 or vmlsl.u8, chosen by
        @ weight_func).
        @
        @ Expected on entry (set up by weight_entry and weight_func):
        @   r0 = read pointer          r4 = write pointer (same block)
        @   r1 = row stride in bytes   r3 = weight magnitude
        @   ip = rows remaining        q8 = bias in every 16-bit lane
        @   q9 = negated log2_den in every 16-bit lane
        @ The macro ends with pop {r4, pc}, i.e. it returns to the caller.
        .macro  weight_8 mac
        vdup.8          d0,  r3                 @ low byte of r3 in all 8 lanes
        vmov            q1,  q8                 @ seed row 0 accumulator with the bias
        vmov            q10, q8                 @ seed row 1 accumulator with the bias
 1:      subs            ip,  ip,  #2            @ two rows per pass; flags used by bne below
        vld1.8          {d4},[r0,:64], r1       @ row 0: 8 pixels, then r0 += stride
        \mac            q1,  d0,  d4
        pld             [r0]                    @ preload hint for the next row
        vld1.8          {d6},[r0,:64], r1       @ row 1: 8 pixels, then r0 += stride
        \mac            q10, d0,  d6
        pld             [r0]
        vshl.s16        q1,  q1,  q9            @ negative count in q9: shift right
        vqmovun.s16     d2,  q1                 @ saturate signed 16-bit to unsigned 8-bit
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10                @ d4 is reused: now holds the row 1 result
        vmov            q10, q8                 @ re-seed for the next pass
        vst1.8          {d2},[r4,:64], r1       @ store row 0, then r4 += stride
        vmov            q1,  q8                 @ re-seed q1 only after d2 has been stored
        vst1.8          {d4},[r4,:64], r1       @ store row 1, then r4 += stride
        bne             1b                      @ loop until ip reaches zero
        pop             {r4, pc}                @ matches push {r4, lr} in weight_func
        .endm

        @ weight_4: row loop and epilogue for 4-pixel-wide blocks.
        @ Two 4-byte rows are packed into one d register, so a full pass
        @ handles four rows.  A height of 2 is handled by the tail at label 2.
        @ The mac argument is the widening multiply instruction to apply
        @ (vmlal.u8 or vmlsl.u8, chosen by weight_func).
        @
        @ Expected on entry (set up by weight_entry and weight_func):
        @   r0 = read pointer          r4 = write pointer (same block)
        @   r1 = row stride in bytes   r3 = weight magnitude
        @   ip = rows remaining        q8 = bias in every 16-bit lane
        @   q9 = negated log2_den in every 16-bit lane
        @ Both exits end with pop {r4, pc}, i.e. they return to the caller.
        .macro  weight_4 mac
        vdup.8          d0,  r3                 @ low byte of r3 in all 8 lanes
        vmov            q1,  q8                 @ seed rows 0-1 accumulator with the bias
        vmov            q10, q8                 @ seed rows 2-3 accumulator with the bias
 1:      subs            ip,  ip,  #4            @ four rows per pass; flags used by blt and bne
        vld1.32         {d4[0]},[r0,:32], r1    @ row 0 into the low half of d4
        vld1.32         {d4[1]},[r0,:32], r1    @ row 1 into the high half of d4
        \mac            q1,  d0,  d4
        pld             [r0]                    @ preload hint for the next row
        blt             2f                      @ ip went negative: only two rows were left
        vld1.32         {d6[0]},[r0,:32], r1    @ row 2 into the low half of d6
        vld1.32         {d6[1]},[r0,:32], r1    @ row 3 into the high half of d6
        \mac            q10, d0,  d6
        pld             [r0]
        vshl.s16        q1,  q1,  q9            @ negative count in q9: shift right
        vqmovun.s16     d2,  q1                 @ saturate signed 16-bit to unsigned 8-bit
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10                @ d4 is reused: now holds rows 2-3
        vmov            q10, q8                 @ re-seed for the next pass
        vst1.32         {d2[0]},[r4,:32], r1    @ store row 0
        vst1.32         {d2[1]},[r4,:32], r1    @ store row 1
        vmov            q1,  q8                 @ re-seed q1 only after d2 has been stored
        vst1.32         {d4[0]},[r4,:32], r1    @ store row 2
        vst1.32         {d4[1]},[r4,:32], r1    @ store row 3
        bne             1b                      @ loop until ip reaches zero
        pop             {r4, pc}                @ matches push {r4, lr} in weight_func
        @ Tail for a two-row block: finish and store rows 0-1 only.
 2:      vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
        .endm

        @ weight_func: emits weight_h264_pixels_W_neon, the body shared by
        @ all entry points of width W (the w macro argument).
        @
        @ On entry: r0 = pixel pointer, r1 = stride, r2 = log2_den,
        @ r3 = weight, first stack word = offset, ip = row count (loaded by
        @ weight_entry before branching or falling through to this code).
        @
        @ It builds the constants used by the weight_W row loop:
        @   q9 = -log2_den in every 16-bit lane
        @   q8 = (offset << log2_den) + rounding in every 16-bit lane, where
        @        rounding is 1 << (log2_den - 1) if log2_den is at least 1
        @ and then expands the row loop twice, once per sign of the weight.
        @ Each expansion ends with its own pop {r4, pc}.
        @ NOTE(review): function and .endfunc come from outside this view.
        .macro  weight_func w
 function weight_h264_pixels_\w\()_neon
        push            {r4, lr}
        ldr             r4,  [sp, #8]           @ offset: first stack argument, above the 2 pushed words
        vdup.16         q9,  r2                 @ log2_den in every lane
        mov             lr,  #1                 @ constant 1 for the rounding term
        lsl             r4,  r4,  r2            @ offset << log2_den
        subs            r2,  r2,  #1            @ log2_den - 1; flags used by addge
        vneg.s16        q9,  q9                 @ negative count makes vshl shift right
        addge           r4,  r4,  lr,  lsl r2   @ add 1 << (log2_den - 1) when log2_den >= 1
        cmp             r3,  #0                 @ sign of weight; flags used by blt
        vdup.16         q8,  r4                 @ bias in every lane
        mov             r4,  r0                 @ r4 becomes the write pointer
        blt             10f                     @ negative weight: use the subtracting variant
        weight_\w       vmlal.u8
        @ Negative weight: negate it and subtract the products instead.
 10:     rsb             r3,  r3,  #0
        weight_\w       vmlsl.u8
        .endfunc
        .endm

        @ weight_entry: emits the entry point ff_weight_h264_pixels_WxH_neon.
        @   w = block width, selects the shared body weight_h264_pixels_W_neon
        @   h = block height, loaded into ip as the row count
        @   b = 1 (default) branches to the shared body; with b=0 no branch
        @       is emitted and execution continues into whatever is
        @       assembled next, so the shared body must follow directly.
        @ NOTE(review): export=1 is an argument of the function helper,
        @ which is defined outside this view; presumably it makes the
        @ symbol global, confirm.
        .macro  weight_entry w, h, b=1
 function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
 .if \b
        b               weight_h264_pixels_\w\()_neon
 .endif
        .endfunc
        .endm

        @ Instantiate the entry points and shared bodies, one group per width.
        @ Order matters: in each group the b=0 entry is placed last so that it
        @ falls through into the weight_func body emitted right after it.
        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16

        weight_entry    8,  16
        weight_entry    8,  4
        weight_entry    8,  8,  b=0
        weight_func     8

        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4