|
|
@@ -218,3 +218,276 @@ endfunc |
|
|
|
.unreq POST1 |
|
|
|
.unreq POST2 |
|
|
|
.unreq POST3 |
|
|
|
|
|
|
|
|
|
|
|
@ Register aliases used by ff_dca_qmf_32_subbands_vfp.
@ a1-a4 hold the first four integer arguments on entry; v1-v5 are
@ callee-saved work registers that stay live across the calls made to
@ the synthesis filter.
IN       .req    a1      @ read pointer into samples_in
SBACT    .req    a2      @ sb_act argument
OLDFPSCR .req    a3      @ caller's FPSCR, saved while vector mode is on
IMDCT    .req    a4      @ imdct argument as passed in
WINDOW   .req    v1      @ window argument, reloaded from the stack
OUT      .req    v2      @ samples_out write pointer
BUF      .req    v3      @ pointer into the on-stack transposition buffer
SCALEINT .req    v4      @ only used in softfp case
COUNT    .req    v5      @ packed loop counter

SCALE    .req    s0      @ scale argument (hardfp case)
|
|
|
|
|
|
|
/* Stack layout differs in softfp and hardfp cases:
 *
 * hardfp
 *      fp -> 6 arg words saved by caller
 *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *            s0 on entry
 *      sp -> 3 arg words for callee
 *
 * softfp
 *      fp -> 7 arg words saved by caller
 *            a4,v1-v5,fp,lr on entry
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *      sp -> 4 arg words for callee
 */
|
|
|
|
|
|
|
/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
 *                                 SynthFilterContext *synth, FFTContext *imdct,
 *                                 float (*synth_buf_ptr)[512],
 *                                 int *synth_buf_offset, float (*synth_buf2)[32],
 *                                 const float (*window)[512], float *samples_out,
 *                                 float (*raXin)[32], float scale);
 *
 * Phase 1 builds an 8 x 32 float matrix on the stack: element [j][i] is
 * samples_in[i][j] for i < sb_act and zero otherwise.  Within every group
 * of four source rows, rows 0 and 3 have their sign flipped on the way.
 * Phase 2 calls ff_synth_filter_float_vfp once per row of that matrix
 * (8 calls), advancing samples_out by 32 floats each time.
 *
 * Register/stack arguments as used below:
 *   a1        samples_in
 *   a2        sb_act
 *   a3        synth - never read; the register is reused for the saved FPSCR
 *   a4        imdct - saved on the stack and handed to the callee in a1
 *   [fp, #0]  synth_buf_ptr, synth_buf_offset, synth_buf2 (reloaded to a2-a4)
 *   [fp, #12] window
 *   [fp, #16] samples_out
 *   [fp, #20] raXin - not read; the stack buffer is passed on instead
 *   s0 (hardfp) or [fp, #24] (softfp): scale
 *
 * Lines prefixed VFP are the hardfp variant, NOVFP the softfp variant.
 * NOTE(review): no range check is made on sb_act; the counter packing
 * below appears to assume 0 <= sb_act <= 32 - confirm against callers.
 */
function ff_dca_qmf_32_subbands_vfp, export=1
VFP     push    {a3-a4,v1-v3,v5,fp,lr}
NOVFP   push    {a4,v1-v5,fp,lr}
        add     fp, sp, #8*4            @ fp -> first stacked argument word
        vpush   {s16-s23}
        @ The buffer pointed at by raXin isn't big enough for us to do a
        @ complete matrix transposition as we want to, so allocate an
        @ alternative buffer from the stack. Align to 4 words for speed.
        sub     BUF, sp, #8*32*4
        bic     BUF, BUF, #15
        mov     sp, BUF
        ldr     lr, =0x03330000         @ RunFast mode, short vectors of length 4, stride 2
        fmrx    OLDFPSCR, FPSCR
        fmxr    FPSCR, lr
        @ COUNT is used to count down 2 things at once:
        @ bits 0-4 are the number of word pairs remaining in the output row
        @ bits 5-31 are the number of words to copy (with possible negation)
        @   from the source matrix before we start zeroing the remainder
        @ The upper field starts at sb_act - 4, so it goes negative as soon
        @ as fewer than 4 source rows are left.
        mov     COUNT, #(-4 << 5) + 16
        adds    COUNT, COUNT, SBACT, lsl #5
        bmi     2f                      @ sb_act < 4: no full group to do
1:
        @ Main loop: 4 source rows per iteration, handled as two row pairs.
        @ Even s-registers take one row, odd ones the other, so each d
        @ register ends up holding two horizontally adjacent output words.
        @ Source rows 0 and 1 first:
        vldr    s8,  [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8                  @ short vector op: s8, s10, s12, s14
        vldr    s9,  [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16                @ short vector op: s16, s18, s20, s22
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        @ Source rows 2 and 3; this time the odd registers (row 3) are
        @ the ones negated.
        vldr    s9,  [IN, #(3*8+0)*4]
        vldr    s11, [IN, #(3*8+1)*4]
        vldr    s13, [IN, #(3*8+2)*4]
        vldr    s15, [IN, #(3*8+3)*4]
        vldr    s17, [IN, #(3*8+4)*4]
        vldr    s19, [IN, #(3*8+5)*4]
        vldr    s21, [IN, #(3*8+6)*4]
        vldr    s23, [IN, #(3*8+7)*4]
        vneg.f  s9, s9                  @ short vector op: s9, s11, s13, s15
        vldr    s8,  [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vneg.f  s17, s17                @ short vector op: s17, s19, s21, s23
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vstr    d4,  [BUF, #(0*32+2)*4]
        vstr    d5,  [BUF, #(1*32+2)*4]
        vstr    d6,  [BUF, #(2*32+2)*4]
        vstr    d7,  [BUF, #(3*32+2)*4]
        vstr    d8,  [BUF, #(4*32+2)*4]
        vstr    d9,  [BUF, #(5*32+2)*4]
        vstr    d10, [BUF, #(6*32+2)*4]
        vstr    d11, [BUF, #(7*32+2)*4]
        add     IN, IN, #4*8*4          @ next 4 source rows
        add     BUF, BUF, #4*4          @ next 4 output columns
        subs    COUNT, COUNT, #(4 << 5) + 2     @ 4 rows and 2 word pairs done
        bpl     1b
2:      @ Now deal with the trailing (sb_act mod 4) source rows
        adds    COUNT, COUNT, #3 << 5   @ upper field = rows left - 1
        bmi     4f                      @ sb_act was a multiple of 4
        bics    lr, COUNT, #0x1F        @ lr is only a scratch for the flags
        bne     3f
        @ sb_act was n*4+1: one negated row, paired with zeros
        vldr    s8,  [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9,  zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vneg.f  s16, s16
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1        @ one word pair written
        b       4f
3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
        vldr    s8,  [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9,  [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #(2 << 5) + 1     @ 2 rows and 1 word pair done
        bics    lr, COUNT, #0x1F        @ upper field is 0 only for n*4+3
        bne     4f
        @ sb_act was n*4+3: third row is stored un-negated next to zeros
        vldr    s8,  [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vldr    s9,  zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1
4:      @ Now fill the remainder with 0
        vldr    s8, zero
        vldr    s9, zero
        ands    COUNT, COUNT, #0x1F     @ keep only the word-pair count
        beq     6f
5:      vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d4,  [BUF, #(1*32+0)*4]
        vstr    d4,  [BUF, #(2*32+0)*4]
        vstr    d4,  [BUF, #(3*32+0)*4]
        vstr    d4,  [BUF, #(4*32+0)*4]
        vstr    d4,  [BUF, #(5*32+0)*4]
        vstr    d4,  [BUF, #(6*32+0)*4]
        vstr    d4,  [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        subs    COUNT, COUNT, #1
        bne     5b
6:
        @ Phase 2. Restore the caller's FPSCR before calling out, then run
        @ the synthesis filter over each of the 8 buffer rows.
        fmxr    FPSCR, OLDFPSCR
        ldr     WINDOW, [fp, #3*4]
        ldr     OUT, [fp, #4*4]
        sub     BUF, BUF, #32*4         @ rewind to the start of row 0
NOVFP   ldr     SCALEINT, [fp, #6*4]
        mov     COUNT, #8               @ COUNT now counts rows
VFP     vpush   {SCALE}                 @ keep scale where it can be reloaded
VFP     sub     sp, sp, #3*4            @ outgoing stack argument area
NOVFP   sub     sp, sp, #4*4
7:
        @ a1-a4 (and s0 in the hardfp case) are set up afresh for every
        @ call; everything that must survive lives in v1-v5.
VFP     ldr     a1, [fp, #-7*4]         @ imdct
NOVFP   ldr     a1, [fp, #-8*4]
        ldmia   fp, {a2-a4}
VFP     stmia   sp, {WINDOW, OUT, BUF}
NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT}
VFP     vldr    SCALE, [sp, #3*4]
        @ NOTE(review): the callee is referenced by its bare name; targets
        @ that prefix C symbols may need the project's symbol-mangling
        @ macro here - confirm.
        bl      ff_synth_filter_float_vfp
        add     OUT, OUT, #32*4
        add     BUF, BUF, #32*4
        subs    COUNT, COUNT, #1
        bne     7b

        @ Drop the buffer and outgoing arguments in one step: 8 core
        @ registers plus s16-s23 (8 words) sit directly below fp.
        @ A/T presumably select the ARM and Thumb forms of this - confirm.
A       sub     sp, fp, #(8+8)*4
T       sub     fp, fp, #(8+8)*4
T       mov     sp, fp
        vpop    {s16-s23}
VFP     pop     {a3-a4,v1-v3,v5,fp,pc}
NOVFP   pop     {a4,v1-v5,fp,pc}
endfunc
|
|
|
|
|
|
|
.unreq IN |
|
|
|
.unreq SBACT |
|
|
|
.unreq OLDFPSCR |
|
|
|
.unreq IMDCT |
|
|
|
.unreq WINDOW |
|
|
|
.unreq OUT |
|
|
|
.unreq BUF |
|
|
|
.unreq SCALEINT |
|
|
|
.unreq COUNT |
|
|
|
|
|
|
|
.unreq SCALE |
|
|
|
|
|
|
|
.align 2 |
|
|
|
zero: .word 0 |