|
|
@@ -1575,7 +1575,6 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 |
|
|
beq idct32x32_dc_add_neon |
|
|
beq idct32x32_dc_add_neon |
|
|
push {r4-r8,lr} |
|
|
push {r4-r8,lr} |
|
|
vpush {q4-q6} |
|
|
vpush {q4-q6} |
|
|
movrel r8, min_eob_idct_idct_32 + 2 |
|
|
|
|
|
|
|
|
|
|
|
@ Align the stack, allocate a temp buffer |
|
|
@ Align the stack, allocate a temp buffer |
|
|
T mov r7, sp |
|
|
T mov r7, sp |
|
|
@@ -1597,6 +1596,8 @@ A and r7, sp, #15 |
|
|
cmp r3, #135 |
|
|
cmp r3, #135 |
|
|
ble idct32x32_half_add_neon |
|
|
ble idct32x32_half_add_neon |
|
|
|
|
|
|
|
|
|
|
|
movrel r8, min_eob_idct_idct_32 + 2 |
|
|
|
|
|
|
|
|
.irp i, 0, 4, 8, 12, 16, 20, 24, 28 |
|
|
.irp i, 0, 4, 8, 12, 16, 20, 24, 28 |
|
|
add r0, sp, #(\i*64) |
|
|
add r0, sp, #(\i*64) |
|
|
.if \i > 0 |
|
|
.if \i > 0 |
|
|
@@ -1634,72 +1635,54 @@ A and r7, sp, #15 |
|
|
pop {r4-r8,pc} |
|
|
pop {r4-r8,pc} |
|
|
endfunc |
|
|
endfunc |
|
|
|
|
|
|
|
|
function idct32x32_quarter_add_neon |
|
|
|
|
|
|
|
|
.macro idct32_partial size |
|
|
|
|
|
function idct32x32_\size\()_add_neon |
|
|
.irp i, 0, 4 |
|
|
.irp i, 0, 4 |
|
|
add r0, sp, #(\i*64) |
|
|
add r0, sp, #(\i*64) |
|
|
|
|
|
.ifc \size,quarter |
|
|
.if \i == 4 |
|
|
.if \i == 4 |
|
|
cmp r3, #9 |
|
|
cmp r3, #9 |
|
|
ble 1f |
|
|
ble 1f |
|
|
|
|
|
.endif |
|
|
.endif |
|
|
.endif |
|
|
add r2, r6, #(\i*2) |
|
|
add r2, r6, #(\i*2) |
|
|
bl idct32_1d_4x32_pass1_quarter_neon |
|
|
|
|
|
.endr |
|
|
|
|
|
b 3f |
|
|
|
|
|
|
|
|
|
|
|
1: |
|
|
|
|
|
@ Write zeros to the temp buffer for pass 2 |
|
|
|
|
|
vmov.i16 q14, #0 |
|
|
|
|
|
vmov.i16 q15, #0 |
|
|
|
|
|
.rept 8 |
|
|
|
|
|
vst1.16 {q14-q15}, [r0,:128]! |
|
|
|
|
|
.endr |
|
|
|
|
|
3: |
|
|
|
|
|
.irp i, 0, 4, 8, 12, 16, 20, 24, 28 |
|
|
|
|
|
add r0, r4, #(\i) |
|
|
|
|
|
mov r1, r5 |
|
|
|
|
|
add r2, sp, #(\i*2) |
|
|
|
|
|
bl idct32_1d_4x32_pass2_quarter_neon |
|
|
|
|
|
|
|
|
bl idct32_1d_4x32_pass1_\size\()_neon |
|
|
.endr |
|
|
.endr |
|
|
|
|
|
|
|
|
add sp, sp, r7 |
|
|
|
|
|
vpop {q4-q6} |
|
|
|
|
|
pop {r4-r8,pc} |
|
|
|
|
|
endfunc |
|
|
|
|
|
|
|
|
|
|
|
function idct32x32_half_add_neon |
|
|
|
|
|
.irp i, 0, 4, 8, 12 |
|
|
|
|
|
|
|
|
.ifc \size,half |
|
|
|
|
|
.irp i, 8, 12 |
|
|
add r0, sp, #(\i*64) |
|
|
add r0, sp, #(\i*64) |
|
|
.if \i > 0 |
|
|
|
|
|
ldrh_post r1, r8, #2 |
|
|
|
|
|
cmp r3, r1 |
|
|
|
|
|
it le |
|
|
|
|
|
movle r1, #(16 - \i)/2 |
|
|
|
|
|
|
|
|
.if \i == 12 |
|
|
|
|
|
cmp r3, #70 |
|
|
ble 1f |
|
|
ble 1f |
|
|
.endif |
|
|
.endif |
|
|
add r2, r6, #(\i*2) |
|
|
add r2, r6, #(\i*2) |
|
|
bl idct32_1d_4x32_pass1_half_neon |
|
|
|
|
|
|
|
|
bl idct32_1d_4x32_pass1_\size\()_neon |
|
|
.endr |
|
|
.endr |
|
|
|
|
|
.endif |
|
|
b 3f |
|
|
b 3f |
|
|
|
|
|
|
|
|
1: |
|
|
1: |
|
|
@ Write zeros to the temp buffer for pass 2 |
|
|
@ Write zeros to the temp buffer for pass 2 |
|
|
vmov.i16 q14, #0 |
|
|
vmov.i16 q14, #0 |
|
|
vmov.i16 q15, #0 |
|
|
vmov.i16 q15, #0 |
|
|
2: |
|
|
|
|
|
subs r1, r1, #1 |
|
|
|
|
|
.rept 4 |
|
|
|
|
|
|
|
|
.rept 8 |
|
|
vst1.16 {q14-q15}, [r0,:128]! |
|
|
vst1.16 {q14-q15}, [r0,:128]! |
|
|
.endr |
|
|
.endr |
|
|
bne 2b |
|
|
|
|
|
|
|
|
|
|
|
3: |
|
|
3: |
|
|
.irp i, 0, 4, 8, 12, 16, 20, 24, 28 |
|
|
.irp i, 0, 4, 8, 12, 16, 20, 24, 28 |
|
|
add r0, r4, #(\i) |
|
|
add r0, r4, #(\i) |
|
|
mov r1, r5 |
|
|
mov r1, r5 |
|
|
add r2, sp, #(\i*2) |
|
|
add r2, sp, #(\i*2) |
|
|
bl idct32_1d_4x32_pass2_half_neon |
|
|
|
|
|
|
|
|
bl idct32_1d_4x32_pass2_\size\()_neon |
|
|
.endr |
|
|
.endr |
|
|
|
|
|
|
|
|
add sp, sp, r7 |
|
|
add sp, sp, r7 |
|
|
vpop {q4-q6} |
|
|
vpop {q4-q6} |
|
|
pop {r4-r8,pc} |
|
|
pop {r4-r8,pc} |
|
|
endfunc |
|
|
endfunc |
|
|
|
|
|
.endm |
|
|
|
|
|
|
|
|
|
|
|
idct32_partial quarter |
|
|
|
|
|
idct32_partial half |