aarch64: vp9: use alternative returns in the core loop filter function

Since aarch64 has enough free general-purpose registers, use them to branch to the appropriate storage code. 1-2 cycles faster for the functions using loop_filter 8/16, ... on a cortex-a53. Mixed results (up to 2 cycles faster/slower) on a cortex-a57.
9 years ago · d7595de0b2
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -410,15 +410,19 @@
 .endif
        // If no pixels needed flat8in nor flat8out, jump to a
        // writeout of the inner 4 pixels
        cbz             x5,  7f
        cbnz            x5,  1f
        br              x14
 1:
        mov             x5,  v7.d[0]
 .ifc \sz, .16b
        mov             x6,  v7.d[1]
        orr             x5,  x5,  x6
 .endif
        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        cbz             x5,  8f
        cbnz            x5,  1f
        br              x15

 1:
        // flat8out
        // This writes all outputs into v2-v17 (skipping v6 and v16).
        // If this part is skipped, the output is read from v21-v26 (which is the input
@@ -549,35 +553,24 @@ endfunc

 // Out-of-line body that expands the loop_filter macro with the arguments
 // 8, .8b, 0 and the scratch vectors v16-v19, v28-v31. It is entered with bl
 // from the loop_filter_8 macro.
 // NOTE(review): this listing appears to interleave the older x5 status-code
 // returns with the newer "alternative return" branches through x13 - confirm
 // which lines are meant to be live before changing anything here.
 function vp9_loop_filter_8
        loop_filter     8,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
        // Fall-through exit: return with x5 = 0, so a cbnz on x5 in the
        // caller is not taken.
        mov             x5,  #0
        ret
 6:
        // Exit through local label 6 (presumably targeted from inside the
        // loop_filter expansion - not visible here): return with x5 = 6.
        mov             x5,  #6
        ret
        // Unreachable as listed: it follows a ret with no label in between.
        // x13 is loaded with the address of the call site's 6f by the
        // loop_filter_8 macro before the bl.
        br              x13
 9:
        // Leaves through the address held in x10; where x10 is set is not
        // visible in this view.
        br              x10
 endfunc

 // Out-of-line body that expands the loop_filter macro with the arguments
 // 8, .16b, 88 and the scratch vectors v16-v19, v28-v31. It is entered with
 // bl from the loop_filter_8_16b_mix macro.
 // NOTE(review): this listing appears to interleave the older x5 status-code
 // returns with the newer "alternative return" branches through x13 - confirm
 // which lines are meant to be live before changing anything here.
 function vp9_loop_filter_8_16b_mix
        loop_filter     8,  .16b, 88,   v16, v17, v18, v19, v28, v29, v30, v31
        // Fall-through exit: return with x5 = 0, so a cbnz on x5 in the
        // caller is not taken.
        mov             x5,  #0
        ret
 6:
        // Exit through local label 6 (presumably targeted from inside the
        // loop_filter expansion - not visible here): return with x5 = 6.
        mov             x5,  #6
        ret
        // Unreachable as listed: it follows a ret with no label in between.
        // x13 is loaded with the address of the call site's 6f by the
        // loop_filter_8_16b_mix macro before the bl.
        br              x13
 9:
        // Leaves through the address held in x10; where x10 is set is not
        // visible in this view.
        br              x10
 endfunc

 function vp9_loop_filter_16
        loop_filter     16, .8b,  0,    v8,  v9,  v10, v11, v12, v13, v14, v15
        mov             x5,  #0
        ret
 7:
        mov             x5,  #7
        ret
 8:
        mov             x5,  #8
        ret
 9:
        ldp             d8,  d9,  [sp], 0x10
@@ -589,13 +582,6 @@ endfunc

 function vp9_loop_filter_16_16b
        loop_filter     16, .16b, 0,    v8,  v9,  v10, v11, v12, v13, v14, v15
        mov             x5,  #0
        ret
 7:
        mov             x5,  #7
        ret
 8:
        mov             x5,  #8
        ret
 9:
        ldp             d8,  d9,  [sp], 0x10
@@ -614,11 +600,14 @@ endfunc
 .endm

 // Call wrapper around vp9_loop_filter_8.
 // Loads x13 with the address of the next local label 6 after the expansion
 // site, calls the function (bl overwrites x30), and branches to that same
 // label 6 when x5 is non-zero on return. The code following the expansion
 // must therefore define a local label 6.
 .macro loop_filter_8
        // calculate alternative 'return' targets
        adr             x13, 6f
        bl              vp9_loop_filter_8
        // Non-zero x5 on return selects the 6f path; zero falls through.
        cbnz            x5,  6f
 .endm

 // Call wrapper around vp9_loop_filter_8_16b_mix.
 // Parameter: mix - compared against 48 and 84 below to choose the 64-bit
 // constant placed in x11 before the call.
 // Loads x13 with the address of the next local label 6 after the expansion
 // site, calls the function (bl overwrites x30), and branches to that label 6
 // when x5 is non-zero on return.
 // NOTE(review): part of the .if/.elseif chain is elided in this view (see
 // the hunk header line in the middle), so the value chosen for mix == 84 and
 // the condition guarding the all-ones constant cannot be confirmed here.
 .macro loop_filter_8_16b_mix mix
        // calculate alternative 'return' targets
        adr             x13, 6f
 .if \mix == 48
        // mix == 48: only the upper 32 bits of x11 are set.
        mov             x11, #0xffffffff00000000
 .elseif \mix == 84
@@ -627,21 +616,20 @@ endfunc
        mov             x11, #0xffffffffffffffff
 .endif
        bl              vp9_loop_filter_8_16b_mix
        // Non-zero x5 on return selects the 6f path; zero falls through.
        cbnz            x5,  6f
 .endm

 // Call wrapper around vp9_loop_filter_16.
 // Loads x14 and x15 with the addresses of the next local labels 7 and 8
 // after the expansion site, calls the function (bl overwrites x30), then
 // dispatches on x5: greater than 7 (signed) goes to 8f, equal to 7 goes to
 // 7f, anything else falls through. The code following the expansion must
 // define local labels 7 and 8. The cmp also overwrites the condition flags.
 .macro loop_filter_16
        // calculate alternative 'return' targets
        adr             x14, 7f
        adr             x15, 8f
        bl              vp9_loop_filter_16
        // Three-way dispatch on the value left in x5 by the callee.
        cmp             x5,  7
        b.gt            8f
        b.eq            7f
 .endm

 // Call wrapper around vp9_loop_filter_16_16b.
 // Loads x14 and x15 with the addresses of the next local labels 7 and 8
 // after the expansion site, calls the function (bl overwrites x30), then
 // dispatches on x5: greater than 7 (signed) goes to 8f, equal to 7 goes to
 // 7f, anything else falls through. The code following the expansion must
 // define local labels 7 and 8. The cmp also overwrites the condition flags.
 .macro loop_filter_16_16b
        // calculate alternative 'return' targets
        adr             x14, 7f
        adr             x15, 8f
        bl              vp9_loop_filter_16_16b
        // Three-way dispatch on the value left in x5 by the callee.
        cmp             x5,  7
        b.gt            8f
        b.eq            7f
 .endm