|
|
@@ -588,6 +588,9 @@ endfunc |
|
|
|
// store: write the eight 16-bit lanes (.8h) of vector register v<i> to the
// address held in \dst, then advance \dst by \inc (post-index addressing).
//   i   = vector register number; it is pasted directly after "v", and the
//         \() that follows terminates the argument name so ".8h" can be
//         appended without a separator
//   dst = base address register, updated by the post-index write-back
//   inc = post-index increment (the visible caller passes #16)
.macro store i, dst, inc |
|
|
|
st1 {v\i\().8h}, [\dst], \inc |
|
|
|
.endm |
|
|
|
// movi_v: emit a movi on vector register v<i>, with the arrangement suffix
// and the immediate both supplied by the caller.
//   i    = vector register number, pasted directly after "v"
//   size = arrangement suffix including the leading dot (the visible caller
//          passes .16b); \() terminates the \i argument name before it
//   imm  = immediate operand (the visible caller passes #0)
// Lets the register number come from a macro/.irp argument, as in the
// .irp loop further down that zeroes v24-v31.
.macro movi_v i, size, imm |
|
|
|
movi v\i\()\size, \imm |
|
|
|
.endm |
|
|
|
.macro load_clear i, src, inc |
|
|
|
ld1 {v\i\().8h}, [\src] |
|
|
|
st1 {v2.8h}, [\src], \inc |
|
|
@@ -596,9 +599,8 @@ endfunc |
|
|
|
// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, |
|
|
|
// transpose into a horizontal 16x8 slice and store. |
|
|
|
// x0 = dst (temp buffer) |
|
|
|
// x1 = slice offset |
|
|
|
// x2 = src |
|
|
|
// x3 = slice offset |
|
|
|
// x9 = input stride |
|
|
|
.macro itxfm16_1d_funcs txfm |
|
|
|
function \txfm\()16_1d_8x16_pass1_neon |
|
|
@@ -616,14 +618,14 @@ function \txfm\()16_1d_8x16_pass1_neon |
|
|
|
transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 |
|
|
|
|
|
|
|
// Store the transposed 8x8 blocks horizontally. |
|
|
|
cmp x3, #8 |
|
|
|
cmp x1, #8 |
|
|
|
b.eq 1f |
|
|
|
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 |
|
|
|
store \i, x0, #16 |
|
|
|
.endr |
|
|
|
ret |
|
|
|
1: |
|
|
|
// Special case: For the last input column (x3 == 8), |
|
|
|
// Special case: For the last input column (x1 == 8), |
|
|
|
// which would be stored as the last row in the temp buffer, |
|
|
|
// don't store the first 8x8 block, but keep it in registers |
|
|
|
// for the first slice of the second pass (where it is the |
|
|
@@ -751,13 +753,36 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 |
|
|
|
|
|
|
|
.irp i, 0, 8 |
|
|
|
add x0, sp, #(\i*32) |
|
|
|
.ifc \txfm1\()_\txfm2,idct_idct |
|
|
|
.if \i == 8 |
|
|
|
cmp w3, #38 |
|
|
|
b.le 1f |
|
|
|
.endif |
|
|
|
.endif |
|
|
|
mov x1, #\i |
|
|
|
add x2, x6, #(\i*2) |
|
|
|
mov x3, #\i |
|
|
|
bl \txfm1\()16_1d_8x16_pass1_neon |
|
|
|
.endr |
|
|
|
.ifc \txfm1\()_\txfm2,iadst_idct |
|
|
|
ld1 {v0.8h,v1.8h}, [x10] |
|
|
|
.endif |
|
|
|
|
|
|
|
.ifc \txfm1\()_\txfm2,idct_idct |
|
|
|
b 3f |
|
|
|
1: |
|
|
|
// Set v24-v31 to zero, for the in-register passthrough of |
|
|
|
// coefficients to pass 2. Since we only do two slices, this can |
|
|
|
// only ever happen for the second slice. So we only need to store |
|
|
|
// zeros to the temp buffer for the second half of the buffer. |
|
|
|
// Move x0 to the second half, and use x9 == 32 as increment. |
|
|
|
add x0, x0, #16 |
|
|
|
.irp i, 24, 25, 26, 27, 28, 29, 30, 31 |
|
|
|
movi_v \i, .16b, #0 |
|
|
|
st1 {v24.8h}, [x0], x9 |
|
|
|
.endr |
|
|
|
3: |
|
|
|
.endif |
|
|
|
|
|
|
|
.irp i, 0, 8 |
|
|
|
add x0, x4, #(\i) |
|
|
|
mov x1, x5 |
|
|
@@ -1073,12 +1098,17 @@ function idct32_1d_8x32_pass2_neon |
|
|
|
ret |
|
|
|
endfunc |
|
|
|
|
|
|
|
// Table of four 16-bit thresholds consulted by the first pass of
// ff_vp9_idct_idct_32x32_add_neon. That function points x12 at this table
// via "movrel x12, min_eob_idct_idct_32, 2" (presumably a 2-byte offset that
// skips the leading 0 entry -- confirm against the movrel macro). For each
// slice with \i > 0 it then loads the next halfword (ldrh w1, [x12], #2)
// and compares it with w3; when w3 <= the loaded value (b.le) it branches
// away from the remaining pass-1 calls and writes zeros to the temp buffer
// instead.
// NOTE(review): going by the name, w3 is presumably the eob (end-of-block)
// count and each entry the threshold for one 8-column slice -- confirm
// against the caller.
const min_eob_idct_idct_32, align=4 |
|
|
|
.short 0, 34, 135, 336 |
|
|
|
endconst |
|
|
|
|
|
|
|
function ff_vp9_idct_idct_32x32_add_neon, export=1 |
|
|
|
cmp w3, #1 |
|
|
|
b.eq idct32x32_dc_add_neon |
|
|
|
|
|
|
|
movrel x10, idct_coeffs |
|
|
|
add x11, x10, #32 |
|
|
|
movrel x12, min_eob_idct_idct_32, 2 |
|
|
|
|
|
|
|
mov x15, x30 |
|
|
|
|
|
|
@@ -1099,9 +1129,30 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 |
|
|
|
|
|
|
|
.irp i, 0, 8, 16, 24 |
|
|
|
add x0, sp, #(\i*64) |
|
|
|
.if \i > 0 |
|
|
|
ldrh w1, [x12], #2 |
|
|
|
cmp w3, w1 |
|
|
|
mov x1, #(32 - \i)/4 |
|
|
|
b.le 1f |
|
|
|
.endif |
|
|
|
add x2, x6, #(\i*2) |
|
|
|
bl idct32_1d_8x32_pass1_neon |
|
|
|
.endr |
|
|
|
b 3f |
|
|
|
|
|
|
|
1: |
|
|
|
// Write zeros to the temp buffer for pass 2 |
|
|
|
movi v16.8h, #0 |
|
|
|
movi v17.8h, #0 |
|
|
|
movi v18.8h, #0 |
|
|
|
movi v19.8h, #0 |
|
|
|
2: |
|
|
|
subs x1, x1, #1 |
|
|
|
.rept 4 |
|
|
|
st1 {v16.8h-v19.8h}, [x0], #64 |
|
|
|
.endr |
|
|
|
b.ne 2b |
|
|
|
3: |
|
|
|
.irp i, 0, 8, 16, 24 |
|
|
|
add x0, x4, #(\i) |
|
|
|
mov x1, x5 |
|
|
|