ARM: add new h264 idct functions

Originally committed as revision 16312 to svn://svn.ffmpeg.org/ffmpeg/trunk
17 years ago · 760badc1df
--- a/libavcodec/arm/dsputil_neon.c
+++ b/libavcodec/arm/dsputil_neon.c
@@ -94,6 +94,15 @@ void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,

 /*
  * H.264 inverse-transform helpers implemented in NEON assembly
  * (libavcodec/arm/h264idct_neon.S).
  *
  * ff_h264_idct_add16_neon, ff_h264_idct_add16intra_neon and
  * ff_h264_idct_add8_neon loop over a group of blocks (16, 16 and 8
  * respectively).  For each block they read the nnzc[] entry selected
  * through a scan table and the first coefficient of the block, and
  * based on those either call ff_h264_idct_add_neon, call
  * ff_h264_idct_dc_add_neon, or skip the block.  The destination passed
  * to the per-block routine is the base pointer plus block_offset[i].
  *
  * ff_h264_idct_add8_neon takes an array of two destination pointers
  * (dest[0] and dest[1]) instead of a single one.
  */
 void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
                             DCTELEM *block, int stride,
                             const uint8_t nnzc[6*8]);
 void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride,
                                  const uint8_t nnzc[6*8]);
 void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
                            DCTELEM *block, int stride,
                            const uint8_t nnzc[6*8]);

 void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
 {
@@ -166,4 +175,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)

    c->h264_idct_add = ff_h264_idct_add_neon;
    c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
    c->h264_idct_add16      = ff_h264_idct_add16_neon;
    c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
    c->h264_idct_add8       = ff_h264_idct_add8_neon;
 }
--- a/libavcodec/arm/h264idct_neon.S
+++ b/libavcodec/arm/h264idct_neon.S
@@ -20,6 +20,7 @@

 #include "asm.S"

        preserve8
        .fpu neon

        .text
@@ -94,3 +95,95 @@ function ff_h264_idct_dc_add_neon, export=1
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
        .endfunc

 @-----------------------------------------------------------------------
 @ void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
 @                              DCTELEM *block, int stride,
 @                              const uint8_t nnzc[6*8])
 @ In:    r0 = dst, r1 = block_offset, r2 = block, r3 = stride,
 @        [sp] = nnzc
 @
 @ For each of 16 blocks i (32 bytes of coefficients apiece):
 @     nnz = nnzc[scan8[i]]
 @     if nnz == 0                      -> skip the block
 @     if nnz == 1 and block[0] != 0    -> ff_h264_idct_dc_add_neon
 @     otherwise                        -> ff_h264_idct_add_neon
 @ Each callee receives r0 = dst + block_offset[i], r1 = block, r2 = stride.
 @
 @ Loop state kept across the calls:
 @     r4 = dst          r5 = block_offset cursor   r6 = nnzc
 @     r7 = scan8 cursor r12 = blocks remaining
 @ NOTE(review): r1, r2 and r12 are read again after blx without being
 @ reloaded, so both callees must leave them intact - confirm against
 @ their bodies, which are outside this function.
 @-----------------------------------------------------------------------
 function ff_h264_idct_add16_neon, export=1
        stmdb           sp!, {r4-r8,lr}
        ldr             r6,  [sp, #24]          @ 5th argument sits above the 6 saved words
        mov             r5,  r1                 @ save block_offset before r1 is reused
        mov             r1,  r2                 @ r1 = block  (callee argument 2)
        mov             r2,  r3                 @ r2 = stride (callee argument 3)
        mov             r4,  r0
        movw            r7,  #:lower16:scan8
        movt            r7,  #:upper16:scan8
        mov             r12, #16
 1:      ldr             r0,  [r5], #4           @ r0 = block_offset[i]
        ldrb            r8,  [r7], #1           @ r8 = scan8[i]
        add             r0,  r0,  r4            @ r0 = dst + block_offset[i]
        ldrb            r8,  [r6, r8]           @ r8 = nnz
        subs            r8,  r8,  #1            @ flags: lt when nnz == 0, eq when nnz == 1
        blt             2f
        ldrsh           r8,  [r1]               @ r8 = block[0]; flags from subs still live
        movne           r8,  #0                 @ nnz != 1: force the full IDCT below
        cmp             r8,  #0
        adrne           lr,  ff_h264_idct_dc_add_neon
        adreq           lr,  ff_h264_idct_add_neon
        blx             lr
 2:      add             r1,  r1,  #32           @ advance to the next block of coefficients
        subs            r12, r12, #1
        bne             1b
        ldmia           sp!, {r4-r8,pc}
        .endfunc

 @-----------------------------------------------------------------------
 @ void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
 @                                   DCTELEM *block, int stride,
 @                                   const uint8_t nnzc[6*8])
 @ In:    r0 = dst, r1 = block_offset, r2 = block, r3 = stride,
 @        [sp] = nnzc
 @
 @ For each of 16 blocks i (32 bytes of coefficients apiece):
 @     if nnzc[scan8[i]] != 0   -> ff_h264_idct_add_neon
 @     else if block[0] != 0    -> ff_h264_idct_dc_add_neon
 @     else                     -> skip the block
 @ Each callee receives r0 = dst + block_offset[i], r1 = block, r2 = stride.
 @
 @ Loop state kept across the calls:
 @     r4 = dst          r5 = block_offset cursor   r6 = nnzc
 @     r7 = scan8 cursor r12 = blocks remaining
 @ NOTE(review): r1, r2 and r12 are read again after the call without
 @ being reloaded, so both callees must leave them intact - confirm
 @ against their bodies, which are outside this function.
 @-----------------------------------------------------------------------
 function ff_h264_idct_add16intra_neon, export=1
        stmdb           sp!, {r4-r8,lr}
        ldr             r6,  [sp, #24]          @ 5th argument sits above the 6 saved words
        mov             r5,  r1                 @ save block_offset before r1 is reused
        mov             r1,  r2                 @ r1 = block  (callee argument 2)
        mov             r2,  r3                 @ r2 = stride (callee argument 3)
        mov             r4,  r0
        movw            r7,  #:lower16:scan8
        movt            r7,  #:upper16:scan8
        mov             r12, #16
 1:      ldr             r0,  [r5], #4           @ r0 = block_offset[i]
        ldrb            lr,  [r7], #1           @ lr = scan8[i]
        ldrsh           r8,  [r1]               @ r8 = block[0]
        ldrb            lr,  [r6, lr]           @ lr = nnz
        add             r0,  r0,  r4            @ r0 = dst + block_offset[i]
        cmp             lr,  #0
        adreq           lr,  ff_h264_idct_dc_add_neon
        adrne           lr,  ff_h264_idct_add_neon
        cmpeq           r8,  #0                 @ nnz == 0: decide on block[0] instead
        blxne           lr                      @ call unless nnz and block[0] are both zero
        add             r1,  r1,  #32           @ advance to the next block of coefficients
        subs            r12, r12, #1
        bne             1b
        ldmia           sp!, {r4-r8,pc}
        .endfunc

 @-----------------------------------------------------------------------
 @ void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
 @                             DCTELEM *block, int stride,
 @                             const uint8_t nnzc[6*8])
 @ In:    r0 = dest (array of two pointers), r1 = block_offset,
 @        r2 = block, r3 = stride, [sp] = nnzc
 @
 @ Processes blocks 16..23 (block_offset, coefficient and scan8 cursors
 @ all start 16 entries in).  For each block i:
 @     base = (i < 20) ? dest[0] : dest[1]
 @     if nnzc[scan8[i]] != 0   -> ff_h264_idct_add_neon
 @     else if block[0] != 0    -> ff_h264_idct_dc_add_neon
 @     else                     -> skip the block
 @ Each callee receives r0 = base + block_offset[i], r1 = block,
 @ r2 = stride.
 @
 @ Loop state kept across the calls:
 @     r4 = dest[0]   r9 = dest[1]   r5 = block_offset cursor
 @     r6 = nnzc      r7 = scan8 cursor   ip = blocks remaining (8..1)
 @ r10 is saved and restored but not otherwise used here; with it the
 @ frame is 8 words, which is why nnzc is found at [sp, #32].
 @ NOTE(review): r1, r2 and ip are read again after the call without
 @ being reloaded, so both callees must leave them intact - confirm
 @ against their bodies, which are outside this function.
 @-----------------------------------------------------------------------
 function ff_h264_idct_add8_neon, export=1
        push            {r4-r10,lr}
        ldm             r0,  {r4,r9}            @ r4 = dest[0], r9 = dest[1]
        add             r5,  r1,  #16*4         @ r5 = &block_offset[16] (4-byte ints)
        add             r1,  r2,  #16*32        @ r1 = coefficients of block 16 (32 bytes per block)
        mov             r2,  r3                 @ r2 = stride (callee argument 3)
        ldr             r6,  [sp, #32]          @ 5th argument sits above the 8 saved words
        movw            r7,  #:lower16:scan8+16
        movt            r7,  #:upper16:scan8+16
        mov             ip,  #8
 1:      ldrb            r8,  [r7], #1           @ r8 = scan8[i]
        ldr             r0,  [r5], #4           @ r0 = block_offset[i]
        ldrb            r8,  [r6, r8]           @ r8 = nnz
        @ Select the destination by position in the group of eight:
        @ ip = 8..5 are the first four blocks  -> dest[0]
        @ ip = 4..1 are the last four blocks   -> dest[1]
        @ (Testing bit 2 of a counter running 8..1 would split the blocks
        @ 1/4/3 instead of 4/4, so an unsigned compare is used.)
        cmp             ip,  #4
        addhi           r0,  r0,  r4
        addls           r0,  r0,  r9
        cmp             r8,  #0
        ldrsh           r8,  [r1]               @ r8 = block[0]; flags from cmp still live
        adrne           lr,  ff_h264_idct_add_neon
        adreq           lr,  ff_h264_idct_dc_add_neon
        cmpeq           r8,  #0                 @ nnz == 0: decide on block[0] instead
        blxne           lr                      @ call unless nnz and block[0] are both zero
        subs            ip,  ip,  #1
        add             r1,  r1,  #32           @ advance to the next block of coefficients
        bne             1b
        pop             {r4-r10,pc}
        .endfunc

        .section .rodata
 @ scan8: 24 one-byte indices into the 48-entry nnzc array passed to the
 @ idct_add16 / idct_add16intra / idct_add8 routines.  Every entry is
 @ written as column + row*8.
 @   entries  0..15 are walked by the add16 and add16intra loops
 @   entries 16..23 are walked by the add8 loop (it starts at scan8+16)
 scan8:
        @ entries 0..15
        .byte           4+1*8, 5+1*8
        .byte           4+2*8, 5+2*8
        .byte           6+1*8, 7+1*8
        .byte           6+2*8, 7+2*8
        .byte           4+3*8, 5+3*8
        .byte           4+4*8, 5+4*8
        .byte           6+3*8, 7+3*8
        .byte           6+4*8, 7+4*8
        @ entries 16..19
        .byte           1+1*8, 2+1*8, 1+2*8, 2+2*8
        @ entries 20..23
        .byte           1+4*8, 2+4*8, 1+5*8, 2+5*8