|
|
|
@@ -1123,18 +1123,14 @@ endfunc |
|
|
|
.endm |
|
|
|
|
|
|
|
function idct32_odd |
|
|
|
ld1 {v0.8h,v1.8h}, [x11] |
|
|
|
|
|
|
|
dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a |
|
|
|
dmbutterfly v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a |
|
|
|
dmbutterfly v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a |
|
|
|
dmbutterfly v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a |
|
|
|
dmbutterfly v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a |
|
|
|
dmbutterfly v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a |
|
|
|
dmbutterfly v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a |
|
|
|
dmbutterfly v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a |
|
|
|
|
|
|
|
ld1 {v0.8h}, [x10] |
|
|
|
dmbutterfly v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a |
|
|
|
dmbutterfly v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a |
|
|
|
dmbutterfly v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a |
|
|
|
dmbutterfly v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a |
|
|
|
dmbutterfly v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a |
|
|
|
dmbutterfly v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a |
|
|
|
dmbutterfly v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a |
|
|
|
dmbutterfly v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a |
|
|
|
|
|
|
|
butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 |
|
|
|
butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 |
|
|
|
@@ -1153,18 +1149,14 @@ function idct32_odd |
|
|
|
endfunc |
|
|
|
|
|
|
|
function idct32_odd_half |
|
|
|
ld1 {v0.8h,v1.8h}, [x11] |
|
|
|
|
|
|
|
dmbutterfly_h1 v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a |
|
|
|
dmbutterfly_h2 v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a |
|
|
|
dmbutterfly_h1 v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a |
|
|
|
dmbutterfly_h2 v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a |
|
|
|
dmbutterfly_h1 v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a |
|
|
|
dmbutterfly_h2 v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a |
|
|
|
dmbutterfly_h1 v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a |
|
|
|
dmbutterfly_h2 v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a |
|
|
|
|
|
|
|
ld1 {v0.8h}, [x10] |
|
|
|
dmbutterfly_h1 v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a |
|
|
|
dmbutterfly_h2 v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a |
|
|
|
dmbutterfly_h1 v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a |
|
|
|
dmbutterfly_h2 v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a |
|
|
|
dmbutterfly_h1 v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a |
|
|
|
dmbutterfly_h2 v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a |
|
|
|
dmbutterfly_h1 v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a |
|
|
|
dmbutterfly_h2 v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a |
|
|
|
|
|
|
|
butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 |
|
|
|
butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 |
|
|
|
@@ -1183,18 +1175,14 @@ function idct32_odd_half |
|
|
|
endfunc |
|
|
|
|
|
|
|
function idct32_odd_quarter |
|
|
|
ld1 {v0.8h,v1.8h}, [x11] |
|
|
|
|
|
|
|
dsmull_h v4, v5, v16, v0.h[0] |
|
|
|
dsmull_h v28, v29, v19, v0.h[7] |
|
|
|
dsmull_h v30, v31, v16, v0.h[1] |
|
|
|
dsmull_h v22, v23, v17, v1.h[6] |
|
|
|
dsmull_h v7, v6, v17, v1.h[7] |
|
|
|
dsmull_h v26, v27, v19, v0.h[6] |
|
|
|
dsmull_h v20, v21, v18, v1.h[0] |
|
|
|
dsmull_h v24, v25, v18, v1.h[1] |
|
|
|
|
|
|
|
ld1 {v0.8h}, [x10] |
|
|
|
dsmull_h v4, v5, v16, v8.h[0] |
|
|
|
dsmull_h v28, v29, v19, v8.h[7] |
|
|
|
dsmull_h v30, v31, v16, v8.h[1] |
|
|
|
dsmull_h v22, v23, v17, v9.h[6] |
|
|
|
dsmull_h v7, v6, v17, v9.h[7] |
|
|
|
dsmull_h v26, v27, v19, v8.h[6] |
|
|
|
dsmull_h v20, v21, v18, v9.h[0] |
|
|
|
dsmull_h v24, v25, v18, v9.h[1] |
|
|
|
|
|
|
|
neg v28.4s, v28.4s |
|
|
|
neg v29.4s, v29.4s |
|
|
|
@@ -1240,12 +1228,8 @@ endfunc |
|
|
|
// x1 = unused |
|
|
|
// x2 = src |
|
|
|
// x9 = double input stride |
|
|
|
// x10 = idct_coeffs |
|
|
|
// x11 = idct_coeffs + 32 |
|
|
|
function idct32_1d_8x32_pass1\suffix\()_neon |
|
|
|
mov x14, x30 |
|
|
|
ld1 {v0.8h,v1.8h}, [x10] |
|
|
|
|
|
|
|
movi v2.8h, #0 |
|
|
|
|
|
|
|
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30) |
|
|
|
@@ -1278,14 +1262,14 @@ function idct32_1d_8x32_pass1\suffix\()_neon |
|
|
|
.macro store_rev a, b
        // Store the two input rows \a and \b, followed by the same two rows
        // with their 8 lanes reversed, advancing x0 by 16 bytes per store.
        // There's no rev128 instruction, but we reverse each 64 bit
        // half, and then flip them using an ext with 8 bytes offset.
        // NOTE(review): the corrupted original contained both the old
        // (v0/v1) and new (v2/v3) scratch-register variants of every line,
        // producing six stores instead of four. Only the v2/v3 variant is
        // kept: v0/v1 hold the persistent idct coefficients (loaded once in
        // ff_vp9_idct_idct_32x32_add_neon) and must not be clobbered here.
        rev64           v3.8h,  \b
        st1             {\a},   [x0], #16
        rev64           v2.8h,  \a
        ext             v3.16b, v3.16b, v3.16b, #8
        st1             {\b},   [x0], #16
        ext             v2.16b, v2.16b, v2.16b, #8
        st1             {v3.8h}, [x0], #16
        st1             {v2.8h}, [x0], #16
.endm
|
|
|
store_rev v16.8h, v24.8h |
|
|
|
store_rev v17.8h, v25.8h |
|
|
|
@@ -1339,20 +1323,20 @@ function idct32_1d_8x32_pass1\suffix\()_neon |
|
|
|
// subtracted from the output. |
|
|
|
.macro store_rev a, b
        // Read-modify-write four rows at x0: add \a and \b to the first two,
        // then subtract the lane-reversed \b and \a from the next two
        // (the reversed halves are swapped with an 8-byte ext, since there
        // is no rev128 instruction). x0 advances by 16 bytes per row.
        // NOTE(review): the corrupted original interleaved the old (v0/v1)
        // and new (v2/v3) scratch-register variants of each line, causing
        // v6/v7 to be decremented twice by the same reversed data. Only the
        // v2/v3 variant is kept: v0/v1 hold the persistent idct coefficients
        // (loaded once in ff_vp9_idct_idct_32x32_add_neon) and must survive.
        ld1             {v4.8h},  [x0]
        rev64           v3.8h,  \b
        add             v4.8h,  v4.8h,  \a
        rev64           v2.8h,  \a
        st1             {v4.8h},  [x0], #16
        ext             v3.16b, v3.16b, v3.16b, #8
        ld1             {v5.8h},  [x0]
        ext             v2.16b, v2.16b, v2.16b, #8
        add             v5.8h,  v5.8h,  \b
        st1             {v5.8h},  [x0], #16
        ld1             {v6.8h},  [x0]
        sub             v6.8h,  v6.8h,  v3.8h
        st1             {v6.8h},  [x0], #16
        ld1             {v7.8h},  [x0]
        sub             v7.8h,  v7.8h,  v2.8h
        st1             {v7.8h},  [x0], #16
.endm
|
|
|
|
|
|
|
@@ -1376,12 +1360,8 @@ endfunc |
|
|
|
// x2 = src (temp buffer) |
|
|
|
// x7 = negative double temp buffer stride |
|
|
|
// x9 = double temp buffer stride |
|
|
|
// x10 = idct_coeffs |
|
|
|
// x11 = idct_coeffs + 32 |
|
|
|
function idct32_1d_8x32_pass2\suffix\()_neon |
|
|
|
mov x14, x30 |
|
|
|
ld1 {v0.8h,v1.8h}, [x10] |
|
|
|
|
|
|
|
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30) |
|
|
|
.ifb \suffix |
|
|
|
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 |
|
|
|
@@ -1454,15 +1434,15 @@ function idct32_1d_8x32_pass2\suffix\()_neon |
|
|
|
sub v6.8h, v6.8h, \c |
|
|
|
sub v7.8h, v7.8h, \d |
|
|
|
.endif |
|
|
|
ld1 {v0.8b}, [x0], x1 |
|
|
|
ld1 {v1.8b}, [x0], x1 |
|
|
|
ld1 {v10.8b}, [x0], x1 |
|
|
|
ld1 {v11.8b}, [x0], x1 |
|
|
|
srshr v4.8h, v4.8h, #6 |
|
|
|
ld1 {v2.8b}, [x0], x1 |
|
|
|
srshr v5.8h, v5.8h, #6 |
|
|
|
uaddw v4.8h, v4.8h, v0.8b |
|
|
|
uaddw v4.8h, v4.8h, v10.8b |
|
|
|
ld1 {v3.8b}, [x0], x1 |
|
|
|
srshr v6.8h, v6.8h, #6 |
|
|
|
uaddw v5.8h, v5.8h, v1.8b |
|
|
|
uaddw v5.8h, v5.8h, v11.8b |
|
|
|
srshr v7.8h, v7.8h, #6 |
|
|
|
sub x0, x0, x1, lsl #2 |
|
|
|
uaddw v6.8h, v6.8h, v2.8b |
|
|
|
@@ -1503,13 +1483,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 |
|
|
|
b.eq idct32x32_dc_add_neon |
|
|
|
|
|
|
|
movrel x10, idct_coeffs |
|
|
|
add x11, x10, #32 |
|
|
|
movrel x12, min_eob_idct_idct_32, 2 |
|
|
|
|
|
|
|
mov x15, x30 |
|
|
|
|
|
|
|
stp d14, d15, [sp, #-0x10]! |
|
|
|
stp d12, d13, [sp, #-0x10]! |
|
|
|
stp d10, d11, [sp, #-0x10]! |
|
|
|
stp d8, d9, [sp, #-0x10]! |
|
|
|
|
|
|
|
@@ -1523,6 +1500,9 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 |
|
|
|
mov x9, #128 |
|
|
|
neg x7, x9 |
|
|
|
|
|
|
|
ld1 {v0.8h,v1.8h}, [x10], #32 |
|
|
|
ld1 {v8.8h,v9.8h}, [x10] |
|
|
|
|
|
|
|
cmp w3, #34 |
|
|
|
b.le idct32x32_quarter_add_neon |
|
|
|
cmp w3, #135 |
|
|
|
@@ -1565,8 +1545,6 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 |
|
|
|
|
|
|
|
ldp d8, d9, [sp], 0x10 |
|
|
|
ldp d10, d11, [sp], 0x10 |
|
|
|
ldp d12, d13, [sp], 0x10 |
|
|
|
ldp d14, d15, [sp], 0x10 |
|
|
|
|
|
|
|
br x15 |
|
|
|
endfunc |
|
|
|
@@ -1592,8 +1570,6 @@ function idct32x32_\size\()_add_neon |
|
|
|
|
|
|
|
ldp d8, d9, [sp], 0x10 |
|
|
|
ldp d10, d11, [sp], 0x10 |
|
|
|
ldp d12, d13, [sp], 0x10 |
|
|
|
ldp d14, d15, [sp], 0x10 |
|
|
|
|
|
|
|
br x15 |
|
|
|
endfunc |
|
|
|
|