|
|
|
@@ -534,7 +534,7 @@ function idct16x16_dc_add_neon |
|
|
|
endfunc |
|
|
|
.ltorg |
|
|
|
|
|
|
|
.macro idct16 |
|
|
|
function idct16 |
|
|
|
mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a |
|
|
|
mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a |
|
|
|
mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a |
|
|
|
@@ -580,9 +580,10 @@ endfunc |
|
|
|
vmov d4, d21 @ d4 = t10a |
|
|
|
butterfly d20, d27, d6, d27 @ d20 = out[4], d27 = out[11] |
|
|
|
butterfly d21, d26, d26, d4 @ d21 = out[5], d26 = out[10] |
|
|
|
.endm |
|
|
|
bx lr |
|
|
|
endfunc |
|
|
|
|
|
|
|
.macro iadst16 |
|
|
|
function iadst16 |
|
|
|
movrel r12, iadst16_coeffs |
|
|
|
vld1.16 {q0-q1}, [r12,:128] |
|
|
|
|
|
|
|
@@ -653,7 +654,8 @@ endfunc |
|
|
|
|
|
|
|
vmov d16, d2 |
|
|
|
vmov d30, d4 |
|
|
|
.endm |
|
|
|
bx lr |
|
|
|
endfunc |
|
|
|
|
|
|
|
.macro itxfm16_1d_funcs txfm |
|
|
|
@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, |
|
|
|
@@ -662,6 +664,8 @@ endfunc |
|
|
|
@ r1 = slice offset |
|
|
|
@ r2 = src |
|
|
|
function \txfm\()16_1d_4x16_pass1_neon |
|
|
|
push {lr} |
|
|
|
|
|
|
|
mov r12, #32 |
|
|
|
vmov.s16 q2, #0 |
|
|
|
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 |
|
|
|
@@ -669,7 +673,7 @@ function \txfm\()16_1d_4x16_pass1_neon |
|
|
|
vst1.16 {d4}, [r2,:64], r12 |
|
|
|
.endr |
|
|
|
|
|
|
|
\txfm\()16 |
|
|
|
bl \txfm\()16 |
|
|
|
|
|
|
|
@ Do four 4x4 transposes. Originally, d16-d31 contain the |
|
|
|
@ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 |
|
|
|
@@ -682,7 +686,7 @@ function \txfm\()16_1d_4x16_pass1_neon |
|
|
|
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 |
|
|
|
vst1.16 {d\i}, [r0,:64]! |
|
|
|
.endr |
|
|
|
bx lr |
|
|
|
pop {pc} |
|
|
|
1: |
|
|
|
@ Special case: For the last input column (r1 == 12), |
|
|
|
@ which would be stored as the last row in the temp buffer, |
|
|
|
@@ -709,7 +713,7 @@ function \txfm\()16_1d_4x16_pass1_neon |
|
|
|
vmov d29, d17 |
|
|
|
vmov d30, d18 |
|
|
|
vmov d31, d19 |
|
|
|
bx lr |
|
|
|
pop {pc} |
|
|
|
endfunc |
|
|
|
|
|
|
|
@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, |
|
|
|
@@ -719,6 +723,7 @@ endfunc |
|
|
|
@ r2 = src (temp buffer) |
|
|
|
@ r3 = slice offset |
|
|
|
function \txfm\()16_1d_4x16_pass2_neon |
|
|
|
push {lr} |
|
|
|
mov r12, #32 |
|
|
|
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 |
|
|
|
vld1.16 {d\i}, [r2,:64], r12 |
|
|
|
@@ -732,7 +737,7 @@ function \txfm\()16_1d_4x16_pass2_neon |
|
|
|
|
|
|
|
add r3, r0, r1 |
|
|
|
lsl r1, r1, #1 |
|
|
|
\txfm\()16 |
|
|
|
bl \txfm\()16 |
|
|
|
|
|
|
|
.macro load_add_store coef0, coef1, coef2, coef3 |
|
|
|
vrshr.s16 \coef0, \coef0, #6 |
|
|
|
@@ -773,7 +778,7 @@ function \txfm\()16_1d_4x16_pass2_neon |
|
|
|
load_add_store q12, q13, q14, q15 |
|
|
|
.purgem load_add_store |
|
|
|
|
|
|
|
bx lr |
|
|
|
pop {pc} |
|
|
|
endfunc |
|
|
|
.endm |
|
|
|
|
|
|
|
@@ -908,7 +913,7 @@ function idct32x32_dc_add_neon |
|
|
|
bx lr |
|
|
|
endfunc |
|
|
|
|
|
|
|
.macro idct32_odd |
|
|
|
function idct32_odd |
|
|
|
movrel r12, idct_coeffs |
|
|
|
add r12, r12, #32 |
|
|
|
vld1.16 {q0-q1}, [r12,:128] |
|
|
|
@@ -967,7 +972,8 @@ endfunc |
|
|
|
mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a |
|
|
|
mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 = t22 |
|
|
|
mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a |
|
|
|
.endm |
|
|
|
bx lr |
|
|
|
endfunc |
|
|
|
|
|
|
|
@ Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix. |
|
|
|
@ We don't have register space to do a single pass IDCT of 4x32 though, |
|
|
|
@@ -979,6 +985,8 @@ endfunc |
|
|
|
@ r1 = unused |
|
|
|
@ r2 = src |
|
|
|
function idct32_1d_4x32_pass1_neon |
|
|
|
push {lr} |
|
|
|
|
|
|
|
movrel r12, idct_coeffs |
|
|
|
vld1.16 {q0-q1}, [r12,:128] |
|
|
|
|
|
|
|
@@ -992,7 +1000,7 @@ function idct32_1d_4x32_pass1_neon |
|
|
|
vst1.16 {d4}, [r2,:64], r12 |
|
|
|
.endr |
|
|
|
|
|
|
|
idct16 |
|
|
|
bl idct16 |
|
|
|
|
|
|
|
@ Do four 4x4 transposes. Originally, d16-d31 contain the |
|
|
|
@ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 |
|
|
|
@@ -1028,7 +1036,7 @@ function idct32_1d_4x32_pass1_neon |
|
|
|
vst1.16 {d4}, [r2,:64], r12 |
|
|
|
.endr |
|
|
|
|
|
|
|
idct32_odd |
|
|
|
bl idct32_odd |
|
|
|
|
|
|
|
transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 |
|
|
|
|
|
|
|
@@ -1054,7 +1062,7 @@ function idct32_1d_4x32_pass1_neon |
|
|
|
store_rev 29, 25, 21, 17 |
|
|
|
store_rev 28, 24, 20, 16 |
|
|
|
.purgem store_rev |
|
|
|
bx lr |
|
|
|
pop {pc} |
|
|
|
endfunc |
|
|
|
.ltorg |
|
|
|
|
|
|
|
@@ -1065,6 +1073,7 @@ endfunc |
|
|
|
@ r1 = dst stride |
|
|
|
@ r2 = src (temp buffer) |
|
|
|
function idct32_1d_4x32_pass2_neon |
|
|
|
push {lr} |
|
|
|
movrel r12, idct_coeffs |
|
|
|
vld1.16 {q0-q1}, [r12,:128] |
|
|
|
|
|
|
|
@@ -1075,7 +1084,7 @@ function idct32_1d_4x32_pass2_neon |
|
|
|
.endr |
|
|
|
sub r2, r2, r12, lsl #4 |
|
|
|
|
|
|
|
idct16 |
|
|
|
bl idct16 |
|
|
|
|
|
|
|
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 |
|
|
|
vst1.16 {d\i}, [r2,:64], r12 |
|
|
|
@@ -1091,7 +1100,7 @@ function idct32_1d_4x32_pass2_neon |
|
|
|
sub r2, r2, r12, lsl #4 |
|
|
|
sub r2, r2, #64 |
|
|
|
|
|
|
|
idct32_odd |
|
|
|
bl idct32_odd |
|
|
|
|
|
|
|
mov r12, #128 |
|
|
|
.macro load_acc_store a, b, c, d, neg=0 |
|
|
|
@@ -1139,7 +1148,7 @@ function idct32_1d_4x32_pass2_neon |
|
|
|
load_acc_store 24, 25, 26, 27, 1 |
|
|
|
load_acc_store 28, 29, 30, 31, 1 |
|
|
|
.purgem load_acc_store |
|
|
|
bx lr |
|
|
|
pop {pc} |
|
|
|
endfunc |
|
|
|
|
|
|
|
const min_eob_idct_idct_32, align=4 |
|
|
|
|