Browse Source

arm: vp9itxfm: Make the larger core transforms standalone functions

This work is sponsored by, and copyright, Google.

This reduces the code size of libavcodec/arm/vp9itxfm_neon.o from
15324 to 12388 bytes.

This gives a small slowdown of a couple of tens of cycles, up to around
150 cycles for the full case of the largest transform, but makes
it more feasible to add more optimized versions of these transforms.

Before:                              Cortex A7       A8       A9      A53
vp9_inv_dct_dct_16x16_sub4_add_neon:    2063.4   1516.0   1719.5   1245.1
vp9_inv_dct_dct_16x16_sub16_add_neon:   3279.3   2454.5   2525.2   1982.3
vp9_inv_dct_dct_32x32_sub4_add_neon:   10750.0   7955.4   8525.6   6754.2
vp9_inv_dct_dct_32x32_sub32_add_neon:  18574.0  17108.4  14216.7  12010.2

After:
vp9_inv_dct_dct_16x16_sub4_add_neon:    2060.8   1608.5   1735.7   1262.0
vp9_inv_dct_dct_16x16_sub16_add_neon:   3211.2   2443.5   2546.1   1999.5
vp9_inv_dct_dct_32x32_sub4_add_neon:   10682.0   8043.8   8581.3   6810.1
vp9_inv_dct_dct_32x32_sub32_add_neon:  18522.4  17277.4  14286.7  12087.9

Signed-off-by: Martin Storsjö <martin@martin.st>
tags/n3.4
Martin Storsjö 9 years ago
parent
commit
0331c3f5e8
1 changed file with 26 additions and 17 deletions
  1. +26
    -17
      libavcodec/arm/vp9itxfm_neon.S

+ 26
- 17
libavcodec/arm/vp9itxfm_neon.S View File

@@ -534,7 +534,7 @@ function idct16x16_dc_add_neon
endfunc endfunc
.ltorg .ltorg


.macro idct16
function idct16
mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a
mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a
mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a
@@ -580,9 +580,10 @@ endfunc
vmov d4, d21 @ d4 = t10a vmov d4, d21 @ d4 = t10a
butterfly d20, d27, d6, d27 @ d20 = out[4], d27 = out[11] butterfly d20, d27, d6, d27 @ d20 = out[4], d27 = out[11]
butterfly d21, d26, d26, d4 @ d21 = out[5], d26 = out[10] butterfly d21, d26, d26, d4 @ d21 = out[5], d26 = out[10]
.endm
bx lr
endfunc


.macro iadst16
function iadst16
movrel r12, iadst16_coeffs movrel r12, iadst16_coeffs
vld1.16 {q0-q1}, [r12,:128] vld1.16 {q0-q1}, [r12,:128]


@@ -653,7 +654,8 @@ endfunc


vmov d16, d2 vmov d16, d2
vmov d30, d4 vmov d30, d4
.endm
bx lr
endfunc


.macro itxfm16_1d_funcs txfm .macro itxfm16_1d_funcs txfm
@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@@ -662,6 +664,8 @@ endfunc
@ r1 = slice offset @ r1 = slice offset
@ r2 = src @ r2 = src
function \txfm\()16_1d_4x16_pass1_neon function \txfm\()16_1d_4x16_pass1_neon
push {lr}

mov r12, #32 mov r12, #32
vmov.s16 q2, #0 vmov.s16 q2, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -669,7 +673,7 @@ function \txfm\()16_1d_4x16_pass1_neon
vst1.16 {d4}, [r2,:64], r12 vst1.16 {d4}, [r2,:64], r12
.endr .endr


\txfm\()16
bl \txfm\()16


@ Do four 4x4 transposes. Originally, d16-d31 contain the @ Do four 4x4 transposes. Originally, d16-d31 contain the
@ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
@@ -682,7 +686,7 @@ function \txfm\()16_1d_4x16_pass1_neon
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
vst1.16 {d\i}, [r0,:64]! vst1.16 {d\i}, [r0,:64]!
.endr .endr
bx lr
pop {pc}
1: 1:
@ Special case: For the last input column (r1 == 12), @ Special case: For the last input column (r1 == 12),
@ which would be stored as the last row in the temp buffer, @ which would be stored as the last row in the temp buffer,
@@ -709,7 +713,7 @@ function \txfm\()16_1d_4x16_pass1_neon
vmov d29, d17 vmov d29, d17
vmov d30, d18 vmov d30, d18
vmov d31, d19 vmov d31, d19
bx lr
pop {pc}
endfunc endfunc


@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@@ -719,6 +723,7 @@ endfunc
@ r2 = src (temp buffer) @ r2 = src (temp buffer)
@ r3 = slice offset @ r3 = slice offset
function \txfm\()16_1d_4x16_pass2_neon function \txfm\()16_1d_4x16_pass2_neon
push {lr}
mov r12, #32 mov r12, #32
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
vld1.16 {d\i}, [r2,:64], r12 vld1.16 {d\i}, [r2,:64], r12
@@ -732,7 +737,7 @@ function \txfm\()16_1d_4x16_pass2_neon


add r3, r0, r1 add r3, r0, r1
lsl r1, r1, #1 lsl r1, r1, #1
\txfm\()16
bl \txfm\()16


.macro load_add_store coef0, coef1, coef2, coef3 .macro load_add_store coef0, coef1, coef2, coef3
vrshr.s16 \coef0, \coef0, #6 vrshr.s16 \coef0, \coef0, #6
@@ -773,7 +778,7 @@ function \txfm\()16_1d_4x16_pass2_neon
load_add_store q12, q13, q14, q15 load_add_store q12, q13, q14, q15
.purgem load_add_store .purgem load_add_store


bx lr
pop {pc}
endfunc endfunc
.endm .endm


@@ -908,7 +913,7 @@ function idct32x32_dc_add_neon
bx lr bx lr
endfunc endfunc


.macro idct32_odd
function idct32_odd
movrel r12, idct_coeffs movrel r12, idct_coeffs
add r12, r12, #32 add r12, r12, #32
vld1.16 {q0-q1}, [r12,:128] vld1.16 {q0-q1}, [r12,:128]
@@ -967,7 +972,8 @@ endfunc
mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a
mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 = t22 mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 = t22
mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a
.endm
bx lr
endfunc


@ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix. @ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
@ We don't have register space to do a single pass IDCT of 4x32 though, @ We don't have register space to do a single pass IDCT of 4x32 though,
@@ -979,6 +985,8 @@ endfunc
@ r1 = unused @ r1 = unused
@ r2 = src @ r2 = src
function idct32_1d_4x32_pass1_neon function idct32_1d_4x32_pass1_neon
push {lr}

movrel r12, idct_coeffs movrel r12, idct_coeffs
vld1.16 {q0-q1}, [r12,:128] vld1.16 {q0-q1}, [r12,:128]


@@ -992,7 +1000,7 @@ function idct32_1d_4x32_pass1_neon
vst1.16 {d4}, [r2,:64], r12 vst1.16 {d4}, [r2,:64], r12
.endr .endr


idct16
bl idct16


@ Do four 4x4 transposes. Originally, d16-d31 contain the @ Do four 4x4 transposes. Originally, d16-d31 contain the
@ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
@@ -1028,7 +1036,7 @@ function idct32_1d_4x32_pass1_neon
vst1.16 {d4}, [r2,:64], r12 vst1.16 {d4}, [r2,:64], r12
.endr .endr


idct32_odd
bl idct32_odd


transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16


@@ -1054,7 +1062,7 @@ function idct32_1d_4x32_pass1_neon
store_rev 29, 25, 21, 17 store_rev 29, 25, 21, 17
store_rev 28, 24, 20, 16 store_rev 28, 24, 20, 16
.purgem store_rev .purgem store_rev
bx lr
pop {pc}
endfunc endfunc
.ltorg .ltorg


@@ -1065,6 +1073,7 @@ endfunc
@ r1 = dst stride @ r1 = dst stride
@ r2 = src (temp buffer) @ r2 = src (temp buffer)
function idct32_1d_4x32_pass2_neon function idct32_1d_4x32_pass2_neon
push {lr}
movrel r12, idct_coeffs movrel r12, idct_coeffs
vld1.16 {q0-q1}, [r12,:128] vld1.16 {q0-q1}, [r12,:128]


@@ -1075,7 +1084,7 @@ function idct32_1d_4x32_pass2_neon
.endr .endr
sub r2, r2, r12, lsl #4 sub r2, r2, r12, lsl #4


idct16
bl idct16


.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
vst1.16 {d\i}, [r2,:64], r12 vst1.16 {d\i}, [r2,:64], r12
@@ -1091,7 +1100,7 @@ function idct32_1d_4x32_pass2_neon
sub r2, r2, r12, lsl #4 sub r2, r2, r12, lsl #4
sub r2, r2, #64 sub r2, r2, #64


idct32_odd
bl idct32_odd


mov r12, #128 mov r12, #128
.macro load_acc_store a, b, c, d, neg=0 .macro load_acc_store a, b, c, d, neg=0
@@ -1139,7 +1148,7 @@ function idct32_1d_4x32_pass2_neon
load_acc_store 24, 25, 26, 27, 1 load_acc_store 24, 25, 26, 27, 1
load_acc_store 28, 29, 30, 31, 1 load_acc_store 28, 29, 30, 31, 1
.purgem load_acc_store .purgem load_acc_store
bx lr
pop {pc}
endfunc endfunc


const min_eob_idct_idct_32, align=4 const min_eob_idct_idct_32, align=4


Loading…
Cancel
Save