|  | @@ -588,6 +588,9 @@ endfunc | 
														
													
														
															
																// Store all eight 16-bit lanes of vector register v\i to the address in
																// \dst, then post-increment \dst by \inc (the visible caller passes #16).
																|  |  | .macro store i, dst, inc |  |  | .macro store i, dst, inc | 
														
													
														
															
																|  |  | st1             {v\i\().8h},  [\dst], \inc |  |  | st1             {v\i\().8h},  [\dst], \inc | 
														
													
														
															
																|  |  | .endm |  |  | .endm | 
														
													
														
															
																// Wrapper around movi that takes the vector register as a bare number \i
																// plus an arrangement suffix \size (e.g. .16b), so that it can be expanded
																// from .irp loops that iterate over register numbers.
																|  |  |  |  |  | .macro movi_v i, size, imm | 
														
													
														
															
																|  |  |  |  |  | movi            v\i\()\size,  \imm | 
														
													
														
															
																|  |  |  |  |  | .endm | 
														
													
														
															
																|  |  | .macro load_clear i, src, inc |  |  | .macro load_clear i, src, inc | 
														
													
														
															
																|  |  | ld1             {v\i\().8h}, [\src] |  |  | ld1             {v\i\().8h}, [\src] | 
														
													
														
															
																|  |  | st1             {v2.8h},  [\src], \inc |  |  | st1             {v2.8h},  [\src], \inc | 
														
													
												
													
														
															
																|  | @@ -596,9 +599,8 @@ endfunc | 
														
													
														
															
																|  |  | // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, |  |  | // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, | 
														
													
														
															
																|  |  | // transpose into a horizontal 16x8 slice and store. |  |  | // transpose into a horizontal 16x8 slice and store. | 
														
													
														
															
																|  |  | // x0 = dst (temp buffer) |  |  | // x0 = dst (temp buffer) | 
														
													
														
															
																|  |  | // x1 = unused |  |  |  | 
														
													
														
															
																|  |  |  |  |  | // x1 = slice offset | 
														
													
														
															
																|  |  | // x2 = src |  |  | // x2 = src | 
														
													
														
															
																|  |  | // x3 = slice offset |  |  |  | 
														
													
														
															
																|  |  | // x9 = input stride |  |  | // x9 = input stride | 
														
													
														
															
																|  |  | .macro itxfm16_1d_funcs txfm |  |  | .macro itxfm16_1d_funcs txfm | 
														
													
														
															
																|  |  | function \txfm\()16_1d_8x16_pass1_neon |  |  | function \txfm\()16_1d_8x16_pass1_neon | 
														
													
												
													
														
															
																|  | @@ -616,14 +618,14 @@ function \txfm\()16_1d_8x16_pass1_neon | 
														
													
														
															
																|  |  | transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 |  |  | transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 | 
														
													
														
															
																|  |  | 
 |  |  | 
 | 
														
													
														
															
																|  |  | // Store the transposed 8x8 blocks horizontally. |  |  | // Store the transposed 8x8 blocks horizontally. | 
														
													
														
															
																|  |  | cmp             x3,  #8 |  |  |  | 
														
													
														
															
																|  |  |  |  |  | cmp             x1,  #8 | 
														
													
														
															
																|  |  | b.eq            1f |  |  | b.eq            1f | 
														
													
														
															
																|  |  | .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 |  |  | .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 | 
														
													
														
															
																|  |  | store           \i,  x0,  #16 |  |  | store           \i,  x0,  #16 | 
														
													
														
															
																|  |  | .endr |  |  | .endr | 
														
													
														
															
																|  |  | ret |  |  | ret | 
														
													
														
															
																|  |  | 1: |  |  | 1: | 
														
													
														
															
																|  |  | // Special case: For the last input column (x3 == 8), |  |  |  | 
														
													
														
															
																|  |  |  |  |  | // Special case: For the last input column (x1 == 8), | 
														
													
														
															
																|  |  | // which would be stored as the last row in the temp buffer, |  |  | // which would be stored as the last row in the temp buffer, | 
														
													
														
															
																|  |  | // don't store the first 8x8 block, but keep it in registers |  |  | // don't store the first 8x8 block, but keep it in registers | 
														
													
														
															
																|  |  | // for the first slice of the second pass (where it is the |  |  | // for the first slice of the second pass (where it is the | 
														
													
												
													
														
															
																|  | @@ -751,13 +753,36 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 | 
														
													
														
															
																|  |  | 
 |  |  | 
 | 
														
													
														
															
																|  |  | .irp i, 0, 8 |  |  | .irp i, 0, 8 | 
														
													
														
															
																|  |  | add             x0,  sp,  #(\i*32) |  |  | add             x0,  sp,  #(\i*32) | 
														
													
														
															
																|  |  |  |  |  | .ifc \txfm1\()_\txfm2,idct_idct | 
														
													
														
															
																|  |  |  |  |  | .if \i == 8 | 
														
													
														
															
																|  |  |  |  |  | cmp             w3,  #38 | 
														
													
														
															
																|  |  |  |  |  | b.le            1f | 
														
													
														
															
																|  |  |  |  |  | .endif | 
														
													
														
															
																|  |  |  |  |  | .endif | 
														
													
														
															
																|  |  |  |  |  | mov             x1,  #\i | 
														
													
														
															
																|  |  | add             x2,  x6,  #(\i*2) |  |  | add             x2,  x6,  #(\i*2) | 
														
													
														
															
																|  |  | mov             x3,  #\i |  |  |  | 
														
													
														
															
																|  |  | bl              \txfm1\()16_1d_8x16_pass1_neon |  |  | bl              \txfm1\()16_1d_8x16_pass1_neon | 
														
													
														
															
																|  |  | .endr |  |  | .endr | 
														
													
														
															
																|  |  | .ifc \txfm1\()_\txfm2,iadst_idct |  |  | .ifc \txfm1\()_\txfm2,iadst_idct | 
														
													
														
															
																|  |  | ld1             {v0.8h,v1.8h}, [x10] |  |  | ld1             {v0.8h,v1.8h}, [x10] | 
														
													
														
															
																|  |  | .endif |  |  | .endif | 
														
													
														
															
																|  |  |  |  |  | 
 | 
														
													
														
															
																|  |  |  |  |  | .ifc \txfm1\()_\txfm2,idct_idct | 
														
													
														
															
																|  |  |  |  |  | b               3f | 
														
													
														
															
																|  |  |  |  |  | 1: | 
														
													
														
															
																|  |  |  |  |  | // Set v24-v31 to zero, for the in-register passthrough of | 
														
													
														
															
																|  |  |  |  |  | // coefficients to pass 2. Since we only do two slices, this can | 
														
													
														
															
																|  |  |  |  |  | // only ever happen for the second slice. So we only need to store | 
														
													
														
															
																|  |  |  |  |  | // zeros to the temp buffer for the second half of the buffer. | 
														
													
														
															
																|  |  |  |  |  | // Move x0 to the second half, and use x9 == 32 as increment. | 
														
													
														
															
																|  |  |  |  |  | add             x0,  x0,  #16 | 
														
													
														
															
																|  |  |  |  |  | .irp i, 24, 25, 26, 27, 28, 29, 30, 31 | 
														
													
														
															
																|  |  |  |  |  | movi_v          \i,  .16b, #0 | 
														
													
														
															
																|  |  |  |  |  | st1             {v24.8h},  [x0], x9 | 
														
													
														
															
																|  |  |  |  |  | .endr | 
														
													
														
															
																|  |  |  |  |  | 3: | 
														
													
														
															
																|  |  |  |  |  | .endif | 
														
													
														
															
																|  |  |  |  |  | 
 | 
														
													
														
															
																|  |  | .irp i, 0, 8 |  |  | .irp i, 0, 8 | 
														
													
														
															
																|  |  | add             x0,  x4,  #(\i) |  |  | add             x0,  x4,  #(\i) | 
														
													
														
															
																|  |  | mov             x1,  x5 |  |  | mov             x1,  x5 | 
														
													
												
													
														
															
																|  | @@ -1073,12 +1098,17 @@ function idct32_1d_8x32_pass2_neon | 
														
													
														
															
																|  |  | ret |  |  | ret | 
														
													
														
															
																|  |  | endfunc |  |  | endfunc | 
														
													
														
															
																|  |  | 
 |  |  | 
 | 
														
													
														
															
																// Per-slice thresholds for the first pass of the 32x32 idct_idct, stored as
																// four 16-bit entries. ff_vp9_idct_idct_32x32_add_neon points x12 at entry 1
																// (table + 2) and, for each slice with i > 0, loads the next entry and
																// compares w3 against it; if w3 <= entry it skips the transform for this
																// and all remaining slices and writes zeros to the temp buffer instead.
																// Entry 0 is never loaded by the visible code.
																// NOTE(review): w3 is presumably the eob (end-of-block) count, judging by
																// the table name -- confirm against the caller.
																|  |  |  |  |  | const min_eob_idct_idct_32, align=4 | 
														
													
														
															
																|  |  |  |  |  | .short  0, 34, 135, 336 | 
														
													
														
															
																|  |  |  |  |  | endconst | 
														
													
														
															
																|  |  |  |  |  | 
 | 
														
													
														
															
																|  |  | function ff_vp9_idct_idct_32x32_add_neon, export=1 |  |  | function ff_vp9_idct_idct_32x32_add_neon, export=1 | 
														
													
														
															
																|  |  | cmp             w3,  #1 |  |  | cmp             w3,  #1 | 
														
													
														
															
																|  |  | b.eq            idct32x32_dc_add_neon |  |  | b.eq            idct32x32_dc_add_neon | 
														
													
														
															
																|  |  | 
 |  |  | 
 | 
														
													
														
															
																|  |  | movrel          x10, idct_coeffs |  |  | movrel          x10, idct_coeffs | 
														
													
														
															
																|  |  | add             x11, x10, #32 |  |  | add             x11, x10, #32 | 
														
													
														
															
																|  |  |  |  |  | movrel          x12, min_eob_idct_idct_32 + 2 | 
														
													
														
															
																|  |  | 
 |  |  | 
 | 
														
													
														
															
																|  |  | mov             x15, x30 |  |  | mov             x15, x30 | 
														
													
														
															
																|  |  | 
 |  |  | 
 | 
														
													
												
													
														
															
																|  | @@ -1099,9 +1129,30 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 | 
														
													
														
															
																|  |  | 
 |  |  | 
 | 
														
													
														
															
																|  |  | .irp i, 0, 8, 16, 24 |  |  | .irp i, 0, 8, 16, 24 | 
														
													
														
															
																|  |  | add             x0,  sp,  #(\i*64) |  |  | add             x0,  sp,  #(\i*64) | 
														
													
														
															
																|  |  |  |  |  | .if \i > 0 | 
														
													
														
															
																|  |  |  |  |  | ldrh            w1,  [x12], #2 | 
														
													
														
															
																|  |  |  |  |  | cmp             w3,  w1 | 
														
													
														
															
																|  |  |  |  |  | mov             x1,  #(32 - \i)/4 | 
														
													
														
															
																|  |  |  |  |  | b.le            1f | 
														
													
														
															
																|  |  |  |  |  | .endif | 
														
													
														
															
																|  |  | add             x2,  x6,  #(\i*2) |  |  | add             x2,  x6,  #(\i*2) | 
														
													
														
															
																|  |  | bl              idct32_1d_8x32_pass1_neon |  |  | bl              idct32_1d_8x32_pass1_neon | 
														
													
														
															
																|  |  | .endr |  |  | .endr | 
														
													
														
															
																|  |  |  |  |  | b               3f | 
														
													
														
															
																|  |  |  |  |  | 
 | 
														
													
														
															
																|  |  |  |  |  | 1: | 
														
													
														
															
																|  |  |  |  |  | // Write zeros to the temp buffer for pass 2 | 
														
													
														
															
																|  |  |  |  |  | movi            v16.8h,  #0 | 
														
													
														
															
																|  |  |  |  |  | movi            v17.8h,  #0 | 
														
													
														
															
																|  |  |  |  |  | movi            v18.8h,  #0 | 
														
													
														
															
																|  |  |  |  |  | movi            v19.8h,  #0 | 
														
													
														
															
																|  |  |  |  |  | 2: | 
														
													
														
															
																|  |  |  |  |  | subs            x1,  x1,  #1 | 
														
													
														
															
																|  |  |  |  |  | .rept 4 | 
														
													
														
															
																|  |  |  |  |  | st1             {v16.8h-v19.8h},  [x0], #64 | 
														
													
														
															
																|  |  |  |  |  | .endr | 
														
													
														
															
																|  |  |  |  |  | b.ne            2b | 
														
													
														
															
																|  |  |  |  |  | 3: | 
														
													
														
															
																|  |  | .irp i, 0, 8, 16, 24 |  |  | .irp i, 0, 8, 16, 24 | 
														
													
														
															
																|  |  | add             x0,  x4,  #(\i) |  |  | add             x0,  x4,  #(\i) | 
														
													
														
															
																|  |  | mov             x1,  x5 |  |  | mov             x1,  x5 | 
														
													
												
													
														
															
																|  | 
 |