Signed-off-by: Mans Rullgard <mans@mansr.com>
tags/n0.9
| @@ -967,6 +967,7 @@ CONFIG_LIST=" | |||
| static | |||
| swscale | |||
| swscale_alpha | |||
| thumb | |||
| vaapi | |||
| vdpau | |||
| version3 | |||
| @@ -2607,7 +2608,7 @@ if enabled alpha; then | |||
| elif enabled arm; then | |||
| check_cflags -marm | |||
| enabled thumb && check_cflags -mthumb || check_cflags -marm | |||
| nogas=die | |||
| if check_cpp_condition stddef.h "defined __ARM_PCS_VFP"; then | |||
| @@ -114,12 +114,15 @@ static inline float *VMUL4S(float *dst, const float *v, unsigned idx, | |||
| "vmov d1, %2, %3 \n\t" | |||
| "lsls %6, %6, #1 \n\t" | |||
| "and %0, %5, #1<<31 \n\t" | |||
| "it cs \n\t" | |||
| "lslcs %5, %5, #1 \n\t" | |||
| "lsls %6, %6, #1 \n\t" | |||
| "and %1, %5, #1<<31 \n\t" | |||
| "it cs \n\t" | |||
| "lslcs %5, %5, #1 \n\t" | |||
| "lsls %6, %6, #1 \n\t" | |||
| "and %2, %5, #1<<31 \n\t" | |||
| "it cs \n\t" | |||
| "lslcs %5, %5, #1 \n\t" | |||
| "vmov d4, %0, %1 \n\t" | |||
| "and %3, %5, #1<<31 \n\t" | |||
| @@ -27,6 +27,7 @@ function ff_ac3_update_bap_counts_arm, export=1 | |||
| lsl r3, lr, #1 | |||
| ldrh r12, [r0, r3] | |||
| subs r2, r2, #1 | |||
| it gt | |||
| ldrbgt lr, [r1], #1 | |||
| add r12, r12, #1 | |||
| strh r12, [r0, r3] | |||
| @@ -42,9 +42,11 @@ function ff_ac3_bit_alloc_calc_bap_armv6, export=1 | |||
| mov r11, r10 | |||
| ldrb r10, [r4], #1 @ band_start_tab[band++] | |||
| subs r9, r9, r5 @ - floor | |||
| it lt | |||
| movlt r9, #0 | |||
| cmp r10, r3 @ - end | |||
| and r9, r9, r8 @ & 0x1fe0 | |||
| ite gt | |||
| subgt r8, r3, r11 | |||
| suble r8, r10, r11 | |||
| add r9, r9, r5 @ + floor => m | |||
| @@ -41,6 +41,7 @@ endfunc | |||
| function ff_ac3_exponent_min_neon, export=1 | |||
| cmp r1, #0 | |||
| it eq | |||
| bxeq lr | |||
| push {lr} | |||
| mov r12, #256 | |||
| @@ -24,9 +24,18 @@ | |||
| # define ELF | |||
| #else | |||
| # define ELF @ | |||
| #endif | |||
| #if CONFIG_THUMB | |||
| # define A @ | |||
| # define T | |||
| #else | |||
| # define A | |||
| # define T @ | |||
| #endif | |||
| .syntax unified | |||
| T .thumb | |||
| .macro require8 val=1 | |||
| ELF .eabi_attribute 24, \val | |||
| @@ -82,6 +91,90 @@ ELF .size \name, . - \name | |||
| #endif | |||
| .endm | |||
| .macro ldr_pre rt, rn, rm:vararg | |||
| A ldr \rt, [\rn, \rm]! | |||
| T add \rn, \rn, \rm | |||
| T ldr \rt, [\rn] | |||
| .endm | |||
| .macro ldr_post rt, rn, rm:vararg | |||
| A ldr \rt, [\rn], \rm | |||
| T ldr \rt, [\rn] | |||
| T add \rn, \rn, \rm | |||
| .endm | |||
| .macro ldrd_reg rt, rt2, rn, rm | |||
| A ldrd \rt, \rt2, [\rn, \rm] | |||
| T add \rt, \rn, \rm | |||
| T ldrd \rt, \rt2, [\rt] | |||
| .endm | |||
| .macro ldrd_post rt, rt2, rn, rm | |||
| A ldrd \rt, \rt2, [\rn], \rm | |||
| T ldrd \rt, \rt2, [\rn] | |||
| T add \rn, \rn, \rm | |||
| .endm | |||
| .macro ldrh_pre rt, rn, rm | |||
| A ldrh \rt, [\rn, \rm]! | |||
| T add \rn, \rn, \rm | |||
| T ldrh \rt, [\rn] | |||
| .endm | |||
| .macro ldrh_dpre rt, rn, rm | |||
| A ldrh \rt, [\rn, -\rm]! | |||
| T sub \rn, \rn, \rm | |||
| T ldrh \rt, [\rn] | |||
| .endm | |||
| .macro ldrh_post rt, rn, rm | |||
| A ldrh \rt, [\rn], \rm | |||
| T ldrh \rt, [\rn] | |||
| T add \rn, \rn, \rm | |||
| .endm | |||
| .macro str_post rt, rn, rm:vararg | |||
| A str \rt, [\rn], \rm | |||
| T str \rt, [\rn] | |||
| T add \rn, \rn, \rm | |||
| .endm | |||
| .macro strb_post rt, rn, rm:vararg | |||
| A strb \rt, [\rn], \rm | |||
| T strb \rt, [\rn] | |||
| T add \rn, \rn, \rm | |||
| .endm | |||
| .macro strd_post rt, rt2, rn, rm | |||
| A strd \rt, \rt2, [\rn], \rm | |||
| T strd \rt, \rt2, [\rn] | |||
| T add \rn, \rn, \rm | |||
| .endm | |||
| .macro strh_pre rt, rn, rm | |||
| A strh \rt, [\rn, \rm]! | |||
| T add \rn, \rn, \rm | |||
| T strh \rt, [\rn] | |||
| .endm | |||
| .macro strh_dpre rt, rn, rm | |||
| A strh \rt, [\rn, -\rm]! | |||
| T sub \rn, \rn, \rm | |||
| T strh \rt, [\rn] | |||
| .endm | |||
| .macro strh_post rt, rn, rm | |||
| A strh \rt, [\rn], \rm | |||
| T strh \rt, [\rn] | |||
| T add \rn, \rn, \rm | |||
| .endm | |||
| .macro strh_dpost rt, rn, rm | |||
| A strh \rt, [\rn], -\rm | |||
| T strh \rt, [\rn] | |||
| T sub \rn, \rn, \rm | |||
| .endm | |||
| #if HAVE_VFP_ARGS | |||
| .eabi_attribute 28, 1 | |||
| # define VFP | |||
| @@ -27,6 +27,7 @@ function ff_dca_lfe_fir_neon, export=1 | |||
| add r5, r2, #256*4-16 @ cf1 | |||
| sub r1, r1, #12 | |||
| cmp r3, #32 | |||
| ite eq | |||
| moveq r6, #256/32 | |||
| movne r6, #256/64 | |||
| NOVFP vldr s0, [sp, #16] @ scale | |||
| @@ -554,10 +554,12 @@ endfunc | |||
| and r9, r5, r14 | |||
| and r10, r6, r14 | |||
| and r11, r7, r14 | |||
| it eq | |||
| andeq r14, r14, r14, \rnd #1 | |||
| add r8, r8, r10 | |||
| add r9, r9, r11 | |||
| ldr r12, =0xfcfcfcfc >> 2 | |||
| itt eq | |||
| addeq r8, r8, r14 | |||
| addeq r9, r9, r14 | |||
| and r4, r12, r4, lsr #2 | |||
| @@ -638,8 +640,10 @@ function ff_add_pixels_clamped_arm, export=1 | |||
| mvn r5, r5 | |||
| mvn r7, r7 | |||
| tst r6, #0x100 | |||
| it ne | |||
| movne r6, r5, lsr #24 | |||
| tst r8, #0x100 | |||
| it ne | |||
| movne r8, r7, lsr #24 | |||
| mov r9, r6 | |||
| ldrsh r5, [r0, #4] /* moved from [A] */ | |||
| @@ -654,8 +658,10 @@ function ff_add_pixels_clamped_arm, export=1 | |||
| mvn r5, r5 | |||
| mvn r7, r7 | |||
| tst r6, #0x100 | |||
| it ne | |||
| movne r6, r5, lsr #24 | |||
| tst r8, #0x100 | |||
| it ne | |||
| movne r8, r7, lsr #24 | |||
| orr r9, r9, r6, lsl #16 | |||
| ldr r4, [r1, #4] /* moved from [B] */ | |||
| @@ -676,8 +682,10 @@ function ff_add_pixels_clamped_arm, export=1 | |||
| mvn r5, r5 | |||
| mvn r7, r7 | |||
| tst r6, #0x100 | |||
| it ne | |||
| movne r6, r5, lsr #24 | |||
| tst r8, #0x100 | |||
| it ne | |||
| movne r8, r7, lsr #24 | |||
| mov r9, r6 | |||
| ldrsh r5, [r0, #12] /* moved from [D] */ | |||
| @@ -692,8 +700,10 @@ function ff_add_pixels_clamped_arm, export=1 | |||
| mvn r5, r5 | |||
| mvn r7, r7 | |||
| tst r6, #0x100 | |||
| it ne | |||
| movne r6, r5, lsr #24 | |||
| tst r8, #0x100 | |||
| it ne | |||
| movne r8, r7, lsr #24 | |||
| orr r9, r9, r6, lsl #16 | |||
| add r0, r0, #16 /* moved from [E] */ | |||
| @@ -47,16 +47,16 @@ function ff_put_pixels16_armv6, export=1 | |||
| ldr r5, [r1, #4] | |||
| ldr r6, [r1, #8] | |||
| ldr r7, [r1, #12] | |||
| ldr r4, [r1], r2 | |||
| ldr_post r4, r1, r2 | |||
| strd r6, r7, [r0, #8] | |||
| ldr r9, [r1, #4] | |||
| strd r4, r5, [r0], r2 | |||
| strd_post r4, r5, r0, r2 | |||
| ldr r10, [r1, #8] | |||
| ldr r11, [r1, #12] | |||
| ldr r8, [r1], r2 | |||
| ldr_post r8, r1, r2 | |||
| strd r10, r11, [r0, #8] | |||
| subs r3, r3, #2 | |||
| strd r8, r9, [r0], r2 | |||
| strd_post r8, r9, r0, r2 | |||
| bne 1b | |||
| pop {r4-r11} | |||
| @@ -67,12 +67,12 @@ function ff_put_pixels8_armv6, export=1 | |||
| push {r4-r7} | |||
| 1: | |||
| ldr r5, [r1, #4] | |||
| ldr r4, [r1], r2 | |||
| ldr_post r4, r1, r2 | |||
| ldr r7, [r1, #4] | |||
| strd r4, r5, [r0], r2 | |||
| ldr r6, [r1], r2 | |||
| strd_post r4, r5, r0, r2 | |||
| ldr_post r6, r1, r2 | |||
| subs r3, r3, #2 | |||
| strd r6, r7, [r0], r2 | |||
| strd_post r6, r7, r0, r2 | |||
| bne 1b | |||
| pop {r4-r7} | |||
| @@ -90,7 +90,7 @@ function ff_put_pixels8_x2_armv6, export=1 | |||
| ldr r5, [r1, #4] | |||
| ldr r7, [r1, #5] | |||
| lsr r6, r4, #8 | |||
| ldr r8, [r1, r2]! | |||
| ldr_pre r8, r1, r2 | |||
| orr r6, r6, r5, lsl #24 | |||
| ldr r9, [r1, #4] | |||
| ldr r11, [r1, #5] | |||
| @@ -112,9 +112,9 @@ function ff_put_pixels8_x2_armv6, export=1 | |||
| uhadd8 r9, r9, r11 | |||
| and r6, r6, r12 | |||
| uadd8 r8, r8, r14 | |||
| strd r4, r5, [r0], r2 | |||
| strd_post r4, r5, r0, r2 | |||
| uadd8 r9, r9, r6 | |||
| strd r8, r9, [r0], r2 | |||
| strd_post r8, r9, r0, r2 | |||
| bne 1b | |||
| pop {r4-r11, pc} | |||
| @@ -127,7 +127,7 @@ function ff_put_pixels8_y2_armv6, export=1 | |||
| orr r12, r12, r12, lsl #16 | |||
| ldr r4, [r1] | |||
| ldr r5, [r1, #4] | |||
| ldr r6, [r1, r2]! | |||
| ldr_pre r6, r1, r2 | |||
| ldr r7, [r1, #4] | |||
| 1: | |||
| subs r3, r3, #2 | |||
| @@ -136,7 +136,7 @@ function ff_put_pixels8_y2_armv6, export=1 | |||
| uhadd8 r9, r5, r7 | |||
| eor r11, r5, r7 | |||
| and r10, r10, r12 | |||
| ldr r4, [r1, r2]! | |||
| ldr_pre r4, r1, r2 | |||
| uadd8 r8, r8, r10 | |||
| and r11, r11, r12 | |||
| uadd8 r9, r9, r11 | |||
| @@ -148,11 +148,11 @@ function ff_put_pixels8_y2_armv6, export=1 | |||
| eor r7, r5, r7 | |||
| uadd8 r10, r10, r6 | |||
| and r7, r7, r12 | |||
| ldr r6, [r1, r2]! | |||
| ldr_pre r6, r1, r2 | |||
| uadd8 r11, r11, r7 | |||
| strd r8, r9, [r0], r2 | |||
| strd_post r8, r9, r0, r2 | |||
| ldr r7, [r1, #4] | |||
| strd r10, r11, [r0], r2 | |||
| strd_post r10, r11, r0, r2 | |||
| bne 1b | |||
| pop {r4-r11} | |||
| @@ -166,7 +166,7 @@ function ff_put_pixels8_x2_no_rnd_armv6, export=1 | |||
| ldr r4, [r1] | |||
| ldr r5, [r1, #4] | |||
| ldr r7, [r1, #5] | |||
| ldr r8, [r1, r2]! | |||
| ldr_pre r8, r1, r2 | |||
| ldr r9, [r1, #4] | |||
| ldr r14, [r1, #5] | |||
| add r1, r1, r2 | |||
| @@ -191,16 +191,16 @@ function ff_put_pixels8_y2_no_rnd_armv6, export=1 | |||
| push {r4-r9, lr} | |||
| ldr r4, [r1] | |||
| ldr r5, [r1, #4] | |||
| ldr r6, [r1, r2]! | |||
| ldr_pre r6, r1, r2 | |||
| ldr r7, [r1, #4] | |||
| 1: | |||
| subs r3, r3, #2 | |||
| uhadd8 r8, r4, r6 | |||
| ldr r4, [r1, r2]! | |||
| ldr_pre r4, r1, r2 | |||
| uhadd8 r9, r5, r7 | |||
| ldr r5, [r1, #4] | |||
| uhadd8 r12, r4, r6 | |||
| ldr r6, [r1, r2]! | |||
| ldr_pre r6, r1, r2 | |||
| uhadd8 r14, r5, r7 | |||
| ldr r7, [r1, #4] | |||
| stm r0, {r8,r9} | |||
| @@ -220,44 +220,44 @@ function ff_avg_pixels8_armv6, export=1 | |||
| orr lr, lr, lr, lsl #16 | |||
| ldrd r4, r5, [r0] | |||
| ldr r10, [r1, #4] | |||
| ldr r9, [r1], r2 | |||
| ldr_post r9, r1, r2 | |||
| subs r3, r3, #2 | |||
| 1: | |||
| pld [r1, r2] | |||
| eor r8, r4, r9 | |||
| uhadd8 r4, r4, r9 | |||
| eor r12, r5, r10 | |||
| ldrd r6, r7, [r0, r2] | |||
| ldrd_reg r6, r7, r0, r2 | |||
| uhadd8 r5, r5, r10 | |||
| and r8, r8, lr | |||
| ldr r10, [r1, #4] | |||
| and r12, r12, lr | |||
| uadd8 r4, r4, r8 | |||
| ldr r9, [r1], r2 | |||
| ldr_post r9, r1, r2 | |||
| eor r8, r6, r9 | |||
| uadd8 r5, r5, r12 | |||
| pld [r1, r2, lsl #1] | |||
| eor r12, r7, r10 | |||
| uhadd8 r6, r6, r9 | |||
| strd r4, r5, [r0], r2 | |||
| strd_post r4, r5, r0, r2 | |||
| uhadd8 r7, r7, r10 | |||
| beq 2f | |||
| and r8, r8, lr | |||
| ldrd r4, r5, [r0, r2] | |||
| ldrd_reg r4, r5, r0, r2 | |||
| uadd8 r6, r6, r8 | |||
| ldr r10, [r1, #4] | |||
| and r12, r12, lr | |||
| subs r3, r3, #2 | |||
| uadd8 r7, r7, r12 | |||
| ldr r9, [r1], r2 | |||
| strd r6, r7, [r0], r2 | |||
| ldr_post r9, r1, r2 | |||
| strd_post r6, r7, r0, r2 | |||
| b 1b | |||
| 2: | |||
| and r8, r8, lr | |||
| and r12, r12, lr | |||
| uadd8 r6, r6, r8 | |||
| uadd8 r7, r7, r12 | |||
| strd r6, r7, [r0], r2 | |||
| strd_post r6, r7, r0, r2 | |||
| pop {r4-r10, pc} | |||
| endfunc | |||
| @@ -284,7 +284,7 @@ function ff_add_pixels_clamped_armv6, export=1 | |||
| orr r6, r8, r5, lsl #8 | |||
| orr r7, r4, lr, lsl #8 | |||
| subs r3, r3, #1 | |||
| strd r6, r7, [r1], r2 | |||
| strd_post r6, r7, r1, r2 | |||
| bgt 1b | |||
| pop {r4-r8,pc} | |||
| endfunc | |||
| @@ -294,7 +294,7 @@ function ff_get_pixels_armv6, export=1 | |||
| push {r4-r8, lr} | |||
| mov lr, #8 | |||
| 1: | |||
| ldrd r4, r5, [r1], r2 | |||
| ldrd_post r4, r5, r1, r2 | |||
| subs lr, lr, #1 | |||
| uxtb16 r6, r4 | |||
| uxtb16 r4, r4, ror #8 | |||
| @@ -317,8 +317,8 @@ function ff_diff_pixels_armv6, export=1 | |||
| push {r4-r9, lr} | |||
| mov lr, #8 | |||
| 1: | |||
| ldrd r4, r5, [r1], r3 | |||
| ldrd r6, r7, [r2], r3 | |||
| ldrd_post r4, r5, r1, r3 | |||
| ldrd_post r6, r7, r2, r3 | |||
| uxtb16 r8, r4 | |||
| uxtb16 r4, r4, ror #8 | |||
| uxtb16 r9, r6 | |||
| @@ -492,19 +492,19 @@ function ff_pix_abs8_armv6, export=1 | |||
| push {r4-r9, lr} | |||
| mov r0, #0 | |||
| mov lr, #0 | |||
| ldrd r4, r5, [r1], r3 | |||
| ldrd_post r4, r5, r1, r3 | |||
| 1: | |||
| subs r12, r12, #2 | |||
| ldr r7, [r2, #4] | |||
| ldr r6, [r2], r3 | |||
| ldrd r8, r9, [r1], r3 | |||
| ldr_post r6, r2, r3 | |||
| ldrd_post r8, r9, r1, r3 | |||
| usada8 r0, r4, r6, r0 | |||
| pld [r2, r3] | |||
| usada8 lr, r5, r7, lr | |||
| ldr r7, [r2, #4] | |||
| ldr r6, [r2], r3 | |||
| ldr_post r6, r2, r3 | |||
| beq 2f | |||
| ldrd r4, r5, [r1], r3 | |||
| ldrd_post r4, r5, r1, r3 | |||
| usada8 r0, r8, r6, r0 | |||
| pld [r2, r3] | |||
| usada8 lr, r9, r7, lr | |||
| @@ -613,7 +613,7 @@ function ff_pix_sum_armv6, export=1 | |||
| ldr r7, [r0, #12] | |||
| usada8 r2, r6, lr, r2 | |||
| beq 2f | |||
| ldr r4, [r0, r1]! | |||
| ldr_pre r4, r0, r1 | |||
| usada8 r3, r7, lr, r3 | |||
| bgt 1b | |||
| 2: | |||
| @@ -531,6 +531,7 @@ function ff_vorbis_inverse_coupling_neon, export=1 | |||
| 2: vst1.32 {d2-d3}, [r3, :128]! | |||
| vst1.32 {d0-d1}, [r12,:128]! | |||
| it lt | |||
| bxlt lr | |||
| 3: vld1.32 {d2-d3}, [r1,:128] | |||
| @@ -575,6 +576,7 @@ NOVFP vdup.32 q8, r2 | |||
| 2: vst1.32 {q2},[r0,:128]! | |||
| vst1.32 {q3},[r0,:128]! | |||
| ands len, len, #15 | |||
| it eq | |||
| bxeq lr | |||
| 3: vld1.32 {q0},[r1,:128]! | |||
| vmul.f32 q0, q0, q8 | |||
| @@ -638,6 +640,7 @@ NOVFP ldr r3, [sp] | |||
| 2: vst1.32 {q8},[r0,:128]! | |||
| vst1.32 {q9},[r0,:128]! | |||
| ands r3, r3, #7 | |||
| it eq | |||
| popeq {pc} | |||
| 3: vld1.32 {q0},[r1,:128]! | |||
| ldr r12, [r2], #4 | |||
| @@ -55,18 +55,23 @@ function ff_vector_fmul_vfp, export=1 | |||
| 1: | |||
| subs r3, r3, #16 | |||
| vmul.f32 s12, s4, s12 | |||
| itttt ge | |||
| vldmiage r1!, {s16-s19} | |||
| vldmiage r2!, {s24-s27} | |||
| vldmiage r1!, {s20-s23} | |||
| vldmiage r2!, {s28-s31} | |||
| it ge | |||
| vmulge.f32 s24, s16, s24 | |||
| vstmia r0!, {s8-s11} | |||
| vstmia r0!, {s12-s15} | |||
| it ge | |||
| vmulge.f32 s28, s20, s28 | |||
| itttt gt | |||
| vldmiagt r1!, {s0-s3} | |||
| vldmiagt r2!, {s8-s11} | |||
| vldmiagt r1!, {s4-s7} | |||
| vldmiagt r2!, {s12-s15} | |||
| ittt ge | |||
| vmulge.f32 s8, s0, s8 | |||
| vstmiage r0!, {s24-s27} | |||
| vstmiage r0!, {s28-s31} | |||
| @@ -97,33 +102,49 @@ function ff_vector_fmul_reverse_vfp, export=1 | |||
| vmul.f32 s11, s0, s11 | |||
| 1: | |||
| subs r3, r3, #16 | |||
| it ge | |||
| vldmdbge r2!, {s16-s19} | |||
| vmul.f32 s12, s7, s12 | |||
| it ge | |||
| vldmiage r1!, {s24-s27} | |||
| vmul.f32 s13, s6, s13 | |||
| it ge | |||
| vldmdbge r2!, {s20-s23} | |||
| vmul.f32 s14, s5, s14 | |||
| it ge | |||
| vldmiage r1!, {s28-s31} | |||
| vmul.f32 s15, s4, s15 | |||
| it ge | |||
| vmulge.f32 s24, s19, s24 | |||
| it gt | |||
| vldmdbgt r2!, {s0-s3} | |||
| it ge | |||
| vmulge.f32 s25, s18, s25 | |||
| vstmia r0!, {s8-s13} | |||
| it ge | |||
| vmulge.f32 s26, s17, s26 | |||
| it gt | |||
| vldmiagt r1!, {s8-s11} | |||
| itt ge | |||
| vmulge.f32 s27, s16, s27 | |||
| vmulge.f32 s28, s23, s28 | |||
| it gt | |||
| vldmdbgt r2!, {s4-s7} | |||
| it ge | |||
| vmulge.f32 s29, s22, s29 | |||
| vstmia r0!, {s14-s15} | |||
| ittt ge | |||
| vmulge.f32 s30, s21, s30 | |||
| vmulge.f32 s31, s20, s31 | |||
| vmulge.f32 s8, s3, s8 | |||
| it gt | |||
| vldmiagt r1!, {s12-s15} | |||
| itttt ge | |||
| vmulge.f32 s9, s2, s9 | |||
| vmulge.f32 s10, s1, s10 | |||
| vstmiage r0!, {s24-s27} | |||
| vmulge.f32 s11, s0, s11 | |||
| it ge | |||
| vstmiage r0!, {s28-s31} | |||
| bgt 1b | |||
| @@ -71,6 +71,7 @@ endfunc | |||
| function ff_float_to_int16_interleave_neon, export=1 | |||
| cmp r3, #2 | |||
| itt lt | |||
| ldrlt r1, [r1] | |||
| blt ff_float_to_int16_neon | |||
| bne 4f | |||
| @@ -196,6 +197,7 @@ function ff_float_to_int16_interleave_neon, export=1 | |||
| vst1.64 {d3}, [r8], ip | |||
| vst1.64 {d7}, [r8], ip | |||
| subs r3, r3, #4 | |||
| it eq | |||
| popeq {r4-r8,pc} | |||
| cmp r3, #4 | |||
| add r0, r0, #8 | |||
| @@ -305,6 +307,7 @@ function ff_float_to_int16_interleave_neon, export=1 | |||
| vst1.32 {d23[1]}, [r8], ip | |||
| 8: subs r3, r3, #2 | |||
| add r0, r0, #4 | |||
| it eq | |||
| popeq {r4-r8,pc} | |||
| @ 1 channel | |||
| @@ -354,6 +357,7 @@ function ff_float_to_int16_interleave_neon, export=1 | |||
| vst1.16 {d2[3]}, [r5,:16], ip | |||
| vst1.16 {d3[1]}, [r5,:16], ip | |||
| vst1.16 {d3[3]}, [r5,:16], ip | |||
| it eq | |||
| popeq {r4-r8,pc} | |||
| vld1.64 {d0-d1}, [r4,:128]! | |||
| vcvt.s32.f32 q0, q0, #16 | |||
| @@ -46,6 +46,7 @@ function ff_float_to_int16_vfp, export=1 | |||
| vmov r5, r6, s2, s3 | |||
| vmov r7, r8, s4, s5 | |||
| vmov ip, lr, s6, s7 | |||
| it gt | |||
| vldmiagt r1!, {s16-s23} | |||
| ssat r4, #16, r4 | |||
| ssat r3, #16, r3 | |||
| @@ -53,10 +54,12 @@ function ff_float_to_int16_vfp, export=1 | |||
| ssat r5, #16, r5 | |||
| pkhbt r3, r3, r4, lsl #16 | |||
| pkhbt r4, r5, r6, lsl #16 | |||
| itttt gt | |||
| vcvtgt.s32.f32 s0, s16 | |||
| vcvtgt.s32.f32 s1, s17 | |||
| vcvtgt.s32.f32 s2, s18 | |||
| vcvtgt.s32.f32 s3, s19 | |||
| itttt gt | |||
| vcvtgt.s32.f32 s4, s20 | |||
| vcvtgt.s32.f32 s5, s21 | |||
| vcvtgt.s32.f32 s6, s22 | |||
| @@ -71,7 +71,9 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1 | |||
| pld [r1] | |||
| pld [r1, r2] | |||
| muls r7, r4, r5 | |||
| A muls r7, r4, r5 | |||
| T mul r7, r4, r5 | |||
| T cmp r7, #0 | |||
| rsb r6, r7, r5, lsl #3 | |||
| rsb ip, r7, r4, lsl #3 | |||
| sub r4, r7, r4, lsl #3 | |||
| @@ -197,7 +199,9 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1 | |||
| pld [r1] | |||
| pld [r1, r2] | |||
| muls r7, r4, r5 | |||
| A muls r7, r4, r5 | |||
| T mul r7, r4, r5 | |||
| T cmp r7, #0 | |||
| rsb r6, r7, r5, lsl #3 | |||
| rsb ip, r7, r4, lsl #3 | |||
| sub r4, r7, r4, lsl #3 | |||
| @@ -368,10 +372,10 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1 | |||
| pop {r4-r6, pc} | |||
| 2: | |||
| .ifc \type,put | |||
| ldrh r5, [r1], r2 | |||
| strh r5, [r0], r2 | |||
| ldrh r6, [r1], r2 | |||
| strh r6, [r0], r2 | |||
| ldrh_post r5, r1, r2 | |||
| strh_post r5, r0, r2 | |||
| ldrh_post r6, r1, r2 | |||
| strh_post r6, r0, r2 | |||
| .else | |||
| vld1.16 {d16[0]}, [r1], r2 | |||
| vld1.16 {d16[1]}, [r1], r2 | |||
| @@ -404,28 +408,17 @@ endfunc | |||
| ldr ip, [sp] | |||
| tst r2, r2 | |||
| ldr ip, [ip] | |||
| it ne | |||
| tstne r3, r3 | |||
| vmov.32 d24[0], ip | |||
| and ip, ip, ip, lsl #16 | |||
| it eq | |||
| bxeq lr | |||
| ands ip, ip, ip, lsl #8 | |||
| it lt | |||
| bxlt lr | |||
| .endm | |||
| .macro align_push_regs | |||
| and ip, sp, #15 | |||
| add ip, ip, #32 | |||
| sub sp, sp, ip | |||
| vst1.64 {d12-d15}, [sp,:128] | |||
| sub sp, sp, #32 | |||
| vst1.64 {d8-d11}, [sp,:128] | |||
| .endm | |||
| .macro align_pop_regs | |||
| vld1.64 {d8-d11}, [sp,:128]! | |||
| vld1.64 {d12-d15}, [sp,:128], ip | |||
| .endm | |||
| .macro h264_loop_filter_luma | |||
| vdup.8 q11, r2 @ alpha | |||
| vmovl.u8 q12, d24 | |||
| @@ -506,7 +499,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1 | |||
| vld1.64 {d18,d19}, [r0,:128], r1 | |||
| vld1.64 {d16,d17}, [r0,:128], r1 | |||
| align_push_regs | |||
| vpush {d8-d15} | |||
| h264_loop_filter_luma | |||
| @@ -516,7 +509,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1 | |||
| vst1.64 {d0, d1}, [r0,:128], r1 | |||
| vst1.64 {d10,d11}, [r0,:128] | |||
| align_pop_regs | |||
| vpop {d8-d15} | |||
| bx lr | |||
| endfunc | |||
| @@ -543,7 +536,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1 | |||
| transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 | |||
| align_push_regs | |||
| vpush {d8-d15} | |||
| h264_loop_filter_luma | |||
| @@ -568,7 +561,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1 | |||
| vst1.32 {d1[1]}, [r0], r1 | |||
| vst1.32 {d11[1]}, [r0], r1 | |||
| align_pop_regs | |||
| vpop {d8-d15} | |||
| bx lr | |||
| endfunc | |||
| @@ -1116,6 +1109,7 @@ function \type\()_h264_qpel8_hv_lowpass_neon | |||
| vrhadd.u8 d11, d11, d7 | |||
| sub r0, r0, r2, lsl #3 | |||
| .endif | |||
| vst1.64 {d12}, [r0,:64], r2 | |||
| vst1.64 {d13}, [r0,:64], r2 | |||
| vst1.64 {d14}, [r0,:64], r2 | |||
| @@ -1263,7 +1257,9 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1 | |||
| \type\()_h264_qpel8_mc11: | |||
| lowpass_const r3 | |||
| mov r11, sp | |||
| bic sp, sp, #15 | |||
| A bic sp, sp, #15 | |||
| T bic r0, r11, #15 | |||
| T mov sp, r0 | |||
| sub sp, sp, #64 | |||
| mov r0, sp | |||
| sub r1, r1, #2 | |||
| @@ -1271,14 +1267,14 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1 | |||
| mov ip, #8 | |||
| vpush {d8-d15} | |||
| bl put_h264_qpel8_h_lowpass_neon | |||
| ldrd r0, [r11] | |||
| ldrd r0, [r11], #8 | |||
| mov r3, r2 | |||
| add ip, sp, #64 | |||
| sub r1, r1, r2, lsl #1 | |||
| mov r2, #8 | |||
| bl \type\()_h264_qpel8_v_lowpass_l2_neon | |||
| vpop {d8-d15} | |||
| add sp, r11, #8 | |||
| mov sp, r11 | |||
| pop {r11, pc} | |||
| endfunc | |||
| @@ -1287,7 +1283,9 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1 | |||
| \type\()_h264_qpel8_mc21: | |||
| lowpass_const r3 | |||
| mov r11, sp | |||
| bic sp, sp, #15 | |||
| A bic sp, sp, #15 | |||
| T bic r0, r11, #15 | |||
| T mov sp, r0 | |||
| sub sp, sp, #(8*8+16*12) | |||
| sub r1, r1, #2 | |||
| mov r3, #8 | |||
| @@ -1296,14 +1294,14 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1 | |||
| vpush {d8-d15} | |||
| bl put_h264_qpel8_h_lowpass_neon | |||
| mov r4, r0 | |||
| ldrd r0, [r11] | |||
| ldrd r0, [r11], #8 | |||
| sub r1, r1, r2, lsl #1 | |||
| sub r1, r1, #2 | |||
| mov r3, r2 | |||
| sub r2, r4, #64 | |||
| bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |||
| vpop {d8-d15} | |||
| add sp, r11, #8 | |||
| mov sp, r11 | |||
| pop {r4, r10, r11, pc} | |||
| endfunc | |||
| @@ -1330,7 +1328,9 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1 | |||
| \type\()_h264_qpel8_mc12: | |||
| lowpass_const r3 | |||
| mov r11, sp | |||
| bic sp, sp, #15 | |||
| A bic sp, sp, #15 | |||
| T bic r0, r11, #15 | |||
| T mov sp, r0 | |||
| sub sp, sp, #(8*8+16*12) | |||
| sub r1, r1, r2, lsl #1 | |||
| mov r3, r2 | |||
| @@ -1339,20 +1339,22 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1 | |||
| vpush {d8-d15} | |||
| bl put_h264_qpel8_v_lowpass_neon | |||
| mov r4, r0 | |||
| ldrd r0, [r11] | |||
| ldrd r0, [r11], #8 | |||
| sub r1, r1, r3, lsl #1 | |||
| sub r1, r1, #2 | |||
| sub r2, r4, #64 | |||
| bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |||
| vpop {d8-d15} | |||
| add sp, r11, #8 | |||
| mov sp, r11 | |||
| pop {r4, r10, r11, pc} | |||
| endfunc | |||
| function ff_\type\()_h264_qpel8_mc22_neon, export=1 | |||
| push {r4, r10, r11, lr} | |||
| mov r11, sp | |||
| bic sp, sp, #15 | |||
| A bic sp, sp, #15 | |||
| T bic r4, r11, #15 | |||
| T mov sp, r4 | |||
| sub r1, r1, r2, lsl #1 | |||
| sub r1, r1, #2 | |||
| mov r3, r2 | |||
| @@ -1441,21 +1443,23 @@ function ff_\type\()_h264_qpel16_mc11_neon, export=1 | |||
| \type\()_h264_qpel16_mc11: | |||
| lowpass_const r3 | |||
| mov r11, sp | |||
| bic sp, sp, #15 | |||
| A bic sp, sp, #15 | |||
| T bic r0, r11, #15 | |||
| T mov sp, r0 | |||
| sub sp, sp, #256 | |||
| mov r0, sp | |||
| sub r1, r1, #2 | |||
| mov r3, #16 | |||
| vpush {d8-d15} | |||
| bl put_h264_qpel16_h_lowpass_neon | |||
| ldrd r0, [r11] | |||
| ldrd r0, [r11], #8 | |||
| mov r3, r2 | |||
| add ip, sp, #64 | |||
| sub r1, r1, r2, lsl #1 | |||
| mov r2, #16 | |||
| bl \type\()_h264_qpel16_v_lowpass_l2_neon | |||
| vpop {d8-d15} | |||
| add sp, r11, #8 | |||
| mov sp, r11 | |||
| pop {r4, r11, pc} | |||
| endfunc | |||
| @@ -1464,20 +1468,22 @@ function ff_\type\()_h264_qpel16_mc21_neon, export=1 | |||
| \type\()_h264_qpel16_mc21: | |||
| lowpass_const r3 | |||
| mov r11, sp | |||
| bic sp, sp, #15 | |||
| A bic sp, sp, #15 | |||
| T bic r0, r11, #15 | |||
| T mov sp, r0 | |||
| sub sp, sp, #(16*16+16*12) | |||
| sub r1, r1, #2 | |||
| mov r0, sp | |||
| vpush {d8-d15} | |||
| bl put_h264_qpel16_h_lowpass_neon_packed | |||
| mov r4, r0 | |||
| ldrd r0, [r11] | |||
| ldrd r0, [r11], #8 | |||
| sub r1, r1, r2, lsl #1 | |||
| sub r1, r1, #2 | |||
| mov r3, r2 | |||
| bl \type\()_h264_qpel16_hv_lowpass_l2_neon | |||
| vpop {d8-d15} | |||
| add sp, r11, #8 | |||
| mov sp, r11 | |||
| pop {r4-r5, r9-r11, pc} | |||
| endfunc | |||
| @@ -1504,7 +1510,9 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1 | |||
| \type\()_h264_qpel16_mc12: | |||
| lowpass_const r3 | |||
| mov r11, sp | |||
| bic sp, sp, #15 | |||
| A bic sp, sp, #15 | |||
| T bic r0, r11, #15 | |||
| T mov sp, r0 | |||
| sub sp, sp, #(16*16+16*12) | |||
| sub r1, r1, r2, lsl #1 | |||
| mov r0, sp | |||
| @@ -1512,13 +1520,13 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1 | |||
| vpush {d8-d15} | |||
| bl put_h264_qpel16_v_lowpass_neon_packed | |||
| mov r4, r0 | |||
| ldrd r0, [r11] | |||
| ldrd r0, [r11], #8 | |||
| sub r1, r1, r3, lsl #1 | |||
| sub r1, r1, #2 | |||
| mov r2, r3 | |||
| bl \type\()_h264_qpel16_hv_lowpass_l2_neon | |||
| vpop {d8-d15} | |||
| add sp, r11, #8 | |||
| mov sp, r11 | |||
| pop {r4-r5, r9-r11, pc} | |||
| endfunc | |||
| @@ -1526,7 +1534,9 @@ function ff_\type\()_h264_qpel16_mc22_neon, export=1 | |||
| push {r4, r9-r11, lr} | |||
| lowpass_const r3 | |||
| mov r11, sp | |||
| bic sp, sp, #15 | |||
| A bic sp, sp, #15 | |||
| T bic r4, r11, #15 | |||
| T mov sp, r4 | |||
| sub r1, r1, r2, lsl #1 | |||
| sub r1, r1, #2 | |||
| mov r3, r2 | |||
| @@ -106,10 +106,12 @@ function ff_h264_idct_add16_neon, export=1 | |||
| blt 2f | |||
| ldrsh lr, [r1] | |||
| add r0, r0, r4 | |||
| it ne | |||
| movne lr, #0 | |||
| cmp lr, #0 | |||
| adrne lr, ff_h264_idct_dc_add_neon | |||
| adreq lr, ff_h264_idct_add_neon | |||
| ite ne | |||
| adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB | |||
| adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB | |||
| blx lr | |||
| 2: subs ip, ip, #1 | |||
| add r1, r1, #32 | |||
| @@ -132,8 +134,9 @@ function ff_h264_idct_add16intra_neon, export=1 | |||
| add r0, r0, r4 | |||
| cmp r8, #0 | |||
| ldrsh r8, [r1] | |||
| adrne lr, ff_h264_idct_add_neon | |||
| adreq lr, ff_h264_idct_dc_add_neon | |||
| iteet ne | |||
| adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB | |||
| adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB | |||
| cmpeq r8, #0 | |||
| blxne lr | |||
| subs ip, ip, #1 | |||
| @@ -159,12 +162,14 @@ function ff_h264_idct_add8_neon, export=1 | |||
| add r1, r3, r12, lsl #5 | |||
| cmp r8, #0 | |||
| ldrsh r8, [r1] | |||
| adrne lr, ff_h264_idct_add_neon | |||
| adreq lr, ff_h264_idct_dc_add_neon | |||
| iteet ne | |||
| adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB | |||
| adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB | |||
| cmpeq r8, #0 | |||
| blxne lr | |||
| add r12, r12, #1 | |||
| cmp r12, #4 | |||
| itt eq | |||
| moveq r12, #16 | |||
| moveq r4, r9 | |||
| cmp r12, #20 | |||
| @@ -365,10 +370,12 @@ function ff_h264_idct8_add4_neon, export=1 | |||
| blt 2f | |||
| ldrsh lr, [r1] | |||
| add r0, r0, r4 | |||
| it ne | |||
| movne lr, #0 | |||
| cmp lr, #0 | |||
| adrne lr, ff_h264_idct8_dc_add_neon | |||
| adreq lr, ff_h264_idct8_add_neon | |||
| ite ne | |||
| adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB | |||
| adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB | |||
| blx lr | |||
| 2: subs r12, r12, #4 | |||
| add r1, r1, #128 | |||
| @@ -64,11 +64,14 @@ static inline av_const int mid_pred(int a, int b, int c) | |||
| __asm__ ( | |||
| "mov %0, %2 \n\t" | |||
| "cmp %1, %2 \n\t" | |||
| "itt gt \n\t" | |||
| "movgt %0, %1 \n\t" | |||
| "movgt %1, %2 \n\t" | |||
| "cmp %1, %3 \n\t" | |||
| "it le \n\t" | |||
| "movle %1, %3 \n\t" | |||
| "cmp %0, %1 \n\t" | |||
| "it gt \n\t" | |||
| "movgt %0, %1 \n\t" | |||
| : "=&r"(m), "+r"(a) | |||
| : "r"(b), "r"(c) | |||
| @@ -191,7 +191,9 @@ function ff_mdct_calc_neon, export=1 | |||
| vadd.f32 d17, d17, d3 @ in2u+in1d -I | |||
| 1: | |||
| vmul.f32 d7, d0, d21 @ I*s | |||
| ldr r10, [r3, lr, lsr #1] | |||
| A ldr r10, [r3, lr, lsr #1] | |||
| T lsr r10, lr, #1 | |||
| T ldr r10, [r3, r10] | |||
| vmul.f32 d6, d1, d20 @ -R*c | |||
| ldr r6, [r3, #4]! | |||
| vmul.f32 d4, d1, d21 @ -R*s | |||
| @@ -75,7 +75,7 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1 | |||
| sum8 r8, r9, r1, r0, r10, r11, r12, lr | |||
| sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32 | |||
| round r10, r8, r9 | |||
| strh r10, [r3], r4 | |||
| strh_post r10, r3, r4 | |||
| mov lr, #15 | |||
| 1: | |||
| @@ -127,10 +127,10 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1 | |||
| round r10, r8, r9 | |||
| adds r8, r8, r4 | |||
| adc r9, r9, r7 | |||
| strh r10, [r3], r12 | |||
| strh_post r10, r3, r12 | |||
| round r11, r8, r9 | |||
| subs lr, lr, #1 | |||
| strh r11, [r5], -r12 | |||
| strh_dpost r11, r5, r12 | |||
| bgt 1b | |||
| sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33 | |||
| @@ -38,15 +38,21 @@ | |||
| .macro dequant_t dst, src, mul, add, tmp | |||
| rsbs \tmp, ip, \src, asr #16 | |||
| it gt | |||
| addgt \tmp, \add, #0 | |||
| it lt | |||
| rsblt \tmp, \add, #0 | |||
| it ne | |||
| smlatbne \dst, \src, \mul, \tmp | |||
| .endm | |||
| .macro dequant_b dst, src, mul, add, tmp | |||
| rsbs \tmp, ip, \src, lsl #16 | |||
| it gt | |||
| addgt \tmp, \add, #0 | |||
| it lt | |||
| rsblt \tmp, \add, #0 | |||
| it ne | |||
| smlabbne \dst, \src, \mul, \tmp | |||
| .endm | |||
| @@ -80,21 +86,27 @@ function ff_dct_unquantize_h263_armv5te, export=1 | |||
| strh lr, [r0], #2 | |||
| subs r3, r3, #8 | |||
| it gt | |||
| ldrdgt r4, [r0, #0] /* load data early to avoid load/use pipeline stall */ | |||
| bgt 1b | |||
| adds r3, r3, #2 | |||
| it le | |||
| pople {r4-r9,pc} | |||
| 2: | |||
| ldrsh r9, [r0, #0] | |||
| ldrsh lr, [r0, #2] | |||
| mov r8, r2 | |||
| cmp r9, #0 | |||
| it lt | |||
| rsblt r8, r2, #0 | |||
| it ne | |||
| smlabbne r9, r9, r1, r8 | |||
| mov r8, r2 | |||
| cmp lr, #0 | |||
| it lt | |||
| rsblt r8, r2, #0 | |||
| it ne | |||
| smlabbne lr, lr, r1, r8 | |||
| strh r9, [r0], #2 | |||
| strh lr, [r0], #2 | |||
| @@ -57,6 +57,7 @@ function ff_dct_unquantize_h263_neon, export=1 | |||
| subs r3, r3, #16 | |||
| vst1.16 {q0}, [r1,:128]! | |||
| vst1.16 {q8}, [r1,:128]! | |||
| it le | |||
| bxle lr | |||
| cmp r3, #8 | |||
| bgt 1b | |||
| @@ -78,6 +79,7 @@ function ff_dct_unquantize_h263_intra_neon, export=1 | |||
| ldr r6, [r0, #AC_PRED] | |||
| add lr, r0, #INTER_SCANTAB_RASTER_END | |||
| cmp r6, #0 | |||
| it ne | |||
| movne r12, #63 | |||
| bne 1f | |||
| ldr r12, [r12, r2, lsl #2] | |||
| @@ -86,9 +88,11 @@ function ff_dct_unquantize_h263_intra_neon, export=1 | |||
| ldrsh r4, [r1] | |||
| cmp r5, #0 | |||
| mov r5, r1 | |||
| it ne | |||
| movne r2, #0 | |||
| bne 2f | |||
| cmp r2, #4 | |||
| it ge | |||
| addge r0, r0, #4 | |||
| sub r2, r3, #1 | |||
| ldr r6, [r0, #Y_DC_SCALE] | |||
| @@ -137,6 +137,7 @@ function ff_rdft_calc_neon, export=1 | |||
| vst1.32 {d22}, [r5,:64] | |||
| cmp r6, #0 | |||
| it eq | |||
| popeq {r4-r8,pc} | |||
| vmul.f32 d22, d22, d18 | |||
| @@ -121,11 +121,13 @@ __b_evaluation: | |||
| ldr r11, [r12, #offW7] @ R11=W7 | |||
| mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) | |||
| mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) | |||
| teq r2, #0 @ if null avoid muls | |||
| mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
| teq r2, #0 @ if null avoid muls | |||
| itttt ne | |||
| mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
| rsbne r2, r2, #0 @ R2=-ROWr16[3] | |||
| mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
| mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
| it ne | |||
| mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
| @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], | |||
| @@ -148,19 +150,23 @@ __b_evaluation: | |||
| @@ MAC16(b3, -W1, row[7]); | |||
| @@ MAC16(b1, -W5, row[7]); | |||
| mov r3, r3, asr #16 @ R3=ROWr16[5] | |||
| teq r3, #0 @ if null avoid muls | |||
| teq r3, #0 @ if null avoid muls | |||
| it ne | |||
| mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0 | |||
| mov r4, r4, asr #16 @ R4=ROWr16[7] | |||
| itttt ne | |||
| mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2 | |||
| mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3 | |||
| rsbne r3, r3, #0 @ R3=-ROWr16[5] | |||
| mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5]=b1 | |||
| @@ R3 is free now | |||
| teq r4, #0 @ if null avoid muls | |||
| teq r4, #0 @ if null avoid muls | |||
| itttt ne | |||
| mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0 | |||
| mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2 | |||
| rsbne r4, r4, #0 @ R4=-ROWr16[7] | |||
| mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3 | |||
| it ne | |||
| mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1 | |||
| @@ R4 is free now | |||
| __end_b_evaluation: | |||
| @@ -204,16 +210,19 @@ __a_evaluation: | |||
| @@ a2 -= W4*row[4] | |||
| @@ a3 += W4*row[4] | |||
| ldrsh r11, [r14, #8] @ R11=ROWr16[4] | |||
| teq r11, #0 @ if null avoid muls | |||
| teq r11, #0 @ if null avoid muls | |||
| it ne | |||
| mulne r11, r9, r11 @ R11=W4*ROWr16[4] | |||
| @@ R9 is free now | |||
| ldrsh r9, [r14, #12] @ R9=ROWr16[6] | |||
| itttt ne | |||
| addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) | |||
| subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) | |||
| subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) | |||
| addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) | |||
| @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead | |||
| teq r9, #0 @ if null avoid muls | |||
| teq r9, #0 @ if null avoid muls | |||
| itttt ne | |||
| mulne r11, r10, r9 @ R11=W6*ROWr16[6] | |||
| addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) | |||
| mulne r10, r8, r9 @ R10=W2*ROWr16[6] | |||
| @@ -222,6 +231,7 @@ __a_evaluation: | |||
| @@ a1 -= W2*row[6]; | |||
| @@ a2 += W2*row[6]; | |||
| subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) | |||
| itt ne | |||
| subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) | |||
| addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) | |||
| @@ -323,10 +333,12 @@ __b_evaluation2: | |||
| ldrsh r2, [r14, #48] | |||
| mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) | |||
| teq r2, #0 @ if 0, then avoid muls | |||
| itttt ne | |||
| mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
| rsbne r2, r2, #0 @ R2=-ROWr16[3] | |||
| mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
| mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
| it ne | |||
| mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) | |||
| @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), | |||
| @@ -342,18 +354,22 @@ __b_evaluation2: | |||
| @@ MAC16(b1, -W5, col[7x8]); | |||
| ldrsh r3, [r14, #80] @ R3=COLr16[5x8] | |||
| teq r3, #0 @ if 0 then avoid muls | |||
| itttt ne | |||
| mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0 | |||
| mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2 | |||
| mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3 | |||
| rsbne r3, r3, #0 @ R3=-ROWr16[5x8] | |||
| ldrsh r4, [r14, #112] @ R4=COLr16[7x8] | |||
| it ne | |||
| mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5x8]=b1 | |||
| @@ R3 is free now | |||
| teq r4, #0 @ if 0 then avoid muls | |||
| itttt ne | |||
| mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0 | |||
| mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2 | |||
| rsbne r4, r4, #0 @ R4=-ROWr16[7x8] | |||
| mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3 | |||
| it ne | |||
| mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1 | |||
| @@ R4 is free now | |||
| __end_b_evaluation2: | |||
| @@ -390,15 +406,18 @@ __a_evaluation2: | |||
| @@ a3 += W4*row[4] | |||
| ldrsh r11, [r14, #64] @ R11=ROWr16[4] | |||
| teq r11, #0 @ if null avoid muls | |||
| itttt ne | |||
| mulne r11, r9, r11 @ R11=W4*ROWr16[4] | |||
| @@ R9 is free now | |||
| addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) | |||
| subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) | |||
| subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) | |||
| ldrsh r9, [r14, #96] @ R9=ROWr16[6] | |||
| it ne | |||
| addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) | |||
| @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead | |||
| teq r9, #0 @ if null avoid muls | |||
| itttt ne | |||
| mulne r11, r10, r9 @ R11=W6*ROWr16[6] | |||
| addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) | |||
| mulne r10, r8, r9 @ R10=W2*ROWr16[6] | |||
| @@ -407,6 +426,7 @@ __a_evaluation2: | |||
| @@ a1 -= W2*row[6]; | |||
| @@ a2 += W2*row[6]; | |||
| subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) | |||
| itt ne | |||
| subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) | |||
| addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) | |||
| __end_a_evaluation2: | |||
| @@ -49,6 +49,7 @@ function idct_row_armv5te | |||
| ldrd v1, [a1, #8] | |||
| ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */ | |||
| orrs v1, v1, v2 | |||
| itt eq | |||
| cmpeq v1, a4 | |||
| cmpeq v1, a3, lsr #16 | |||
| beq row_dc_only | |||
| @@ -269,6 +270,7 @@ function idct_col_armv5te | |||
| ldmfd sp!, {a3, a4} | |||
| adds a2, a3, v1 | |||
| mov a2, a2, lsr #20 | |||
| it mi | |||
| orrmi a2, a2, #0xf000 | |||
| add ip, a4, v2 | |||
| mov ip, ip, asr #20 | |||
| @@ -276,6 +278,7 @@ function idct_col_armv5te | |||
| str a2, [a1] | |||
| subs a3, a3, v1 | |||
| mov a2, a3, lsr #20 | |||
| it mi | |||
| orrmi a2, a2, #0xf000 | |||
| sub a4, a4, v2 | |||
| mov a4, a4, asr #20 | |||
| @@ -285,6 +288,7 @@ function idct_col_armv5te | |||
| subs a2, a3, v3 | |||
| mov a2, a2, lsr #20 | |||
| it mi | |||
| orrmi a2, a2, #0xf000 | |||
| sub ip, a4, v4 | |||
| mov ip, ip, asr #20 | |||
| @@ -292,6 +296,7 @@ function idct_col_armv5te | |||
| str a2, [a1, #(16*1)] | |||
| adds a3, a3, v3 | |||
| mov a2, a3, lsr #20 | |||
| it mi | |||
| orrmi a2, a2, #0xf000 | |||
| add a4, a4, v4 | |||
| mov a4, a4, asr #20 | |||
| @@ -301,6 +306,7 @@ function idct_col_armv5te | |||
| adds a2, a3, v5 | |||
| mov a2, a2, lsr #20 | |||
| it mi | |||
| orrmi a2, a2, #0xf000 | |||
| add ip, a4, v6 | |||
| mov ip, ip, asr #20 | |||
| @@ -308,6 +314,7 @@ function idct_col_armv5te | |||
| str a2, [a1, #(16*2)] | |||
| subs a3, a3, v5 | |||
| mov a2, a3, lsr #20 | |||
| it mi | |||
| orrmi a2, a2, #0xf000 | |||
| sub a4, a4, v6 | |||
| mov a4, a4, asr #20 | |||
| @@ -317,6 +324,7 @@ function idct_col_armv5te | |||
| adds a2, a3, v7 | |||
| mov a2, a2, lsr #20 | |||
| it mi | |||
| orrmi a2, a2, #0xf000 | |||
| add ip, a4, fp | |||
| mov ip, ip, asr #20 | |||
| @@ -324,6 +332,7 @@ function idct_col_armv5te | |||
| str a2, [a1, #(16*3)] | |||
| subs a3, a3, v7 | |||
| mov a2, a3, lsr #20 | |||
| it mi | |||
| orrmi a2, a2, #0xf000 | |||
| sub a4, a4, fp | |||
| mov a4, a4, asr #20 | |||
| @@ -335,15 +344,19 @@ endfunc | |||
| .macro clip dst, src:vararg /* clip: clamp the operand src to 0..255 and leave it in dst; src is a flexible operand and may carry a shift, e.g. "a2, asr #20" */ | |||
| movs \dst, \src /* copy the operand into dst and set the flags from it */ | |||
| it mi /* IT block for the conditional instruction that follows */ | |||
| movmi \dst, #0 /* negative value: clamp to 0 */ | |||
| cmp \dst, #255 | |||
| it gt /* IT block for the conditional instruction that follows */ | |||
| movgt \dst, #255 /* value above 255: clamp to 255 */ | |||
| .endm | |||
| .macro aclip dst, src:vararg /* aclip: add the operands given in src, clamp the sum to 0..255 and leave it in dst; src is the operand list of an ADDS, e.g. "a2, a3, asr #20" */ | |||
| adds \dst, \src /* dst = sum of the src operands, flags set from the result */ | |||
| it mi /* IT block for the conditional instruction that follows */ | |||
| movmi \dst, #0 /* negative sum: clamp to 0 */ | |||
| cmp \dst, #255 | |||
| it gt /* IT block for the conditional instruction that follows */ | |||
| movgt \dst, #255 /* sum above 255: clamp to 255 */ | |||
| .endm | |||
| @@ -370,35 +383,35 @@ function idct_col_put_armv5te | |||
| orr a2, a3, a4, lsl #8 | |||
| rsb v2, lr, lr, lsl #3 | |||
| ldmfd sp!, {a3, a4} | |||
| strh a2, [v2, v1]! | |||
| strh_pre a2, v2, v1 | |||
| sub a2, a3, v3 | |||
| clip a2, a2, asr #20 | |||
| sub ip, a4, v4 | |||
| clip ip, ip, asr #20 | |||
| orr a2, a2, ip, lsl #8 | |||
| strh a2, [v1, lr]! | |||
| strh_pre a2, v1, lr | |||
| add a3, a3, v3 | |||
| clip a2, a3, asr #20 | |||
| add a4, a4, v4 | |||
| clip a4, a4, asr #20 | |||
| orr a2, a2, a4, lsl #8 | |||
| ldmfd sp!, {a3, a4} | |||
| strh a2, [v2, -lr]! | |||
| strh_dpre a2, v2, lr | |||
| add a2, a3, v5 | |||
| clip a2, a2, asr #20 | |||
| add ip, a4, v6 | |||
| clip ip, ip, asr #20 | |||
| orr a2, a2, ip, lsl #8 | |||
| strh a2, [v1, lr]! | |||
| strh_pre a2, v1, lr | |||
| sub a3, a3, v5 | |||
| clip a2, a3, asr #20 | |||
| sub a4, a4, v6 | |||
| clip a4, a4, asr #20 | |||
| orr a2, a2, a4, lsl #8 | |||
| ldmfd sp!, {a3, a4} | |||
| strh a2, [v2, -lr]! | |||
| strh_dpre a2, v2, lr | |||
| add a2, a3, v7 | |||
| clip a2, a2, asr #20 | |||
| @@ -411,7 +424,7 @@ function idct_col_put_armv5te | |||
| sub a4, a4, fp | |||
| clip a4, a4, asr #20 | |||
| orr a2, a2, a4, lsl #8 | |||
| strh a2, [v2, -lr] | |||
| strh_dpre a2, v2, lr | |||
| ldr pc, [sp], #4 | |||
| endfunc | |||
| @@ -436,7 +449,7 @@ function idct_col_add_armv5te | |||
| ldr v1, [sp, #32] | |||
| sub a4, a4, v2 | |||
| rsb v2, v1, v1, lsl #3 | |||
| ldrh ip, [v2, lr]! | |||
| ldrh_pre ip, v2, lr | |||
| strh a2, [lr] | |||
| and a2, ip, #255 | |||
| aclip a3, a2, a3, asr #20 | |||
| @@ -448,7 +461,7 @@ function idct_col_add_armv5te | |||
| strh a2, [v2] | |||
| ldmfd sp!, {a3, a4} | |||
| ldrh ip, [lr, v1]! | |||
| ldrh_pre ip, lr, v1 | |||
| sub a2, a3, v3 | |||
| add a3, a3, v3 | |||
| and v3, ip, #255 | |||
| @@ -458,7 +471,7 @@ function idct_col_add_armv5te | |||
| aclip v3, v3, ip, lsr #8 | |||
| orr a2, a2, v3, lsl #8 | |||
| add a4, a4, v4 | |||
| ldrh ip, [v2, -v1]! | |||
| ldrh_dpre ip, v2, v1 | |||
| strh a2, [lr] | |||
| and a2, ip, #255 | |||
| aclip a3, a2, a3, asr #20 | |||
| @@ -468,7 +481,7 @@ function idct_col_add_armv5te | |||
| strh a2, [v2] | |||
| ldmfd sp!, {a3, a4} | |||
| ldrh ip, [lr, v1]! | |||
| ldrh_pre ip, lr, v1 | |||
| add a2, a3, v5 | |||
| sub a3, a3, v5 | |||
| and v3, ip, #255 | |||
| @@ -478,7 +491,7 @@ function idct_col_add_armv5te | |||
| aclip v3, v3, ip, lsr #8 | |||
| orr a2, a2, v3, lsl #8 | |||
| sub a4, a4, v6 | |||
| ldrh ip, [v2, -v1]! | |||
| ldrh_dpre ip, v2, v1 | |||
| strh a2, [lr] | |||
| and a2, ip, #255 | |||
| aclip a3, a2, a3, asr #20 | |||
| @@ -488,7 +501,7 @@ function idct_col_add_armv5te | |||
| strh a2, [v2] | |||
| ldmfd sp!, {a3, a4} | |||
| ldrh ip, [lr, v1]! | |||
| ldrh_pre ip, lr, v1 | |||
| add a2, a3, v7 | |||
| sub a3, a3, v7 | |||
| and v3, ip, #255 | |||
| @@ -498,7 +511,7 @@ function idct_col_add_armv5te | |||
| aclip v3, v3, ip, lsr #8 | |||
| orr a2, a2, v3, lsl #8 | |||
| sub a4, a4, fp | |||
| ldrh ip, [v2, -v1]! | |||
| ldrh_dpre ip, v2, v1 | |||
| strh a2, [lr] | |||
| and a2, ip, #255 | |||
| aclip a3, a2, a3, asr #20 | |||
| @@ -200,6 +200,7 @@ function idct_row_armv6 | |||
| ldr r3, [r0, #8] /* r3 = row[3,1] */ | |||
| ldr r2, [r0] /* r2 = row[2,0] */ | |||
| orrs lr, lr, ip | |||
| itt eq | |||
| cmpeq lr, r3 | |||
| cmpeq lr, r2, lsr #16 | |||
| beq 1f | |||
| @@ -282,14 +283,14 @@ function idct_col_put_armv6 | |||
| pop {r1, r2} | |||
| idct_finish_shift_sat COL_SHIFT | |||
| strb r4, [r1], r2 | |||
| strb r5, [r1], r2 | |||
| strb r6, [r1], r2 | |||
| strb r7, [r1], r2 | |||
| strb r11,[r1], r2 | |||
| strb r10,[r1], r2 | |||
| strb r9, [r1], r2 | |||
| strb r8, [r1], r2 | |||
| strb_post r4, r1, r2 | |||
| strb_post r5, r1, r2 | |||
| strb_post r6, r1, r2 | |||
| strb_post r7, r1, r2 | |||
| strb_post r11,r1, r2 | |||
| strb_post r10,r1, r2 | |||
| strb_post r9, r1, r2 | |||
| strb_post r8, r1, r2 | |||
| sub r1, r1, r2, lsl #3 | |||
| @@ -318,16 +319,16 @@ function idct_col_add_armv6 | |||
| add ip, r3, ip, asr #COL_SHIFT | |||
| usat ip, #8, ip | |||
| add r4, r7, r4, asr #COL_SHIFT | |||
| strb ip, [r1], r2 | |||
| strb_post ip, r1, r2 | |||
| ldrb ip, [r1, r2] | |||
| usat r4, #8, r4 | |||
| ldrb r11,[r1, r2, lsl #2] | |||
| add r5, ip, r5, asr #COL_SHIFT | |||
| usat r5, #8, r5 | |||
| strb r4, [r1], r2 | |||
| strb_post r4, r1, r2 | |||
| ldrb r3, [r1, r2] | |||
| ldrb ip, [r1, r2, lsl #2] | |||
| strb r5, [r1], r2 | |||
| strb_post r5, r1, r2 | |||
| ldrb r7, [r1, r2] | |||
| ldrb r4, [r1, r2, lsl #2] | |||
| add r6, r3, r6, asr #COL_SHIFT | |||
| @@ -340,11 +341,11 @@ function idct_col_add_armv6 | |||
| usat r8, #8, r8 | |||
| add lr, r4, lr, asr #COL_SHIFT | |||
| usat lr, #8, lr | |||
| strb r6, [r1], r2 | |||
| strb r10,[r1], r2 | |||
| strb r9, [r1], r2 | |||
| strb r8, [r1], r2 | |||
| strb lr, [r1], r2 | |||
| strb_post r6, r1, r2 | |||
| strb_post r10,r1, r2 | |||
| strb_post r9, r1, r2 | |||
| strb_post r8, r1, r2 | |||
| strb_post lr, r1, r2 | |||
| sub r1, r1, r2, lsl #3 | |||
| @@ -71,7 +71,7 @@ function idct_row4_pld_neon | |||
| add r3, r0, r1, lsl #2 | |||
| pld [r0, r1] | |||
| pld [r0, r1, lsl #1] | |||
| pld [r3, -r1] | |||
| A pld [r3, -r1] | |||
| pld [r3] | |||
| pld [r3, r1] | |||
| add r3, r3, r1, lsl #1 | |||
| @@ -164,6 +164,7 @@ function idct_col4_neon | |||
| orrs r4, r4, r5 | |||
| idct_col4_top | |||
| it eq | |||
| addeq r2, r2, #16 | |||
| beq 1f | |||
| @@ -176,6 +177,7 @@ function idct_col4_neon | |||
| 1: orrs r6, r6, r7 | |||
| ldrd r4, [r2, #16] | |||
| it eq | |||
| addeq r2, r2, #16 | |||
| beq 2f | |||
| @@ -187,6 +189,7 @@ function idct_col4_neon | |||
| 2: orrs r4, r4, r5 | |||
| ldrd r4, [r2, #16] | |||
| it eq | |||
| addeq r2, r2, #16 | |||
| beq 3f | |||
| @@ -199,6 +202,7 @@ function idct_col4_neon | |||
| vadd.i32 q13, q13, q8 | |||
| 3: orrs r4, r4, r5 | |||
| it eq | |||
| addeq r2, r2, #16 | |||
| beq 4f | |||
| @@ -100,9 +100,11 @@ NOVFP vldr s0, [sp, #12*4] @ scale | |||
| vst1.32 {q9}, [r2,:128] | |||
| subs r1, r1, #1 | |||
| it eq | |||
| popeq {r4-r11,pc} | |||
| cmp r4, #0 | |||
| itt eq | |||
| subeq r8, r8, #512*4 | |||
| subeq r9, r9, #512*4 | |||
| sub r5, r5, #512*4 | |||
| @@ -21,6 +21,14 @@ | |||
| #ifndef AVCODEC_ARM_VP56_ARITH_H | |||
| #define AVCODEC_ARM_VP56_ARITH_H | |||
| #if CONFIG_THUMB /* A(x) keeps x only in an ARM-mode build, T(x) keeps x only in a Thumb build; they wrap the inline-asm string fragments that differ between the two encodings */ | |||
| # define A(x) /* Thumb build: ARM-only fragment is dropped */ | |||
| # define T(x) x /* Thumb build: Thumb-only fragment is kept */ | |||
| #else | |||
| # define A(x) x /* ARM build: ARM-only fragment is kept */ | |||
| # define T(x) /* ARM build: Thumb-only fragment is dropped */ | |||
| #endif | |||
| #if HAVE_ARMV6 && HAVE_INLINE_ASM | |||
| #define vp56_rac_get_prob vp56_rac_get_prob_armv6 | |||
| @@ -32,15 +40,21 @@ static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr) | |||
| unsigned bit; | |||
| __asm__ ("adds %3, %3, %0 \n" | |||
| "itt cs \n" | |||
| "cmpcs %7, %4 \n" | |||
| "ldrcsh %2, [%4], #2 \n" | |||
| A("ldrcsh %2, [%4], #2 \n") | |||
| T("ldrhcs %2, [%4], #2 \n") | |||
| "rsb %0, %6, #256 \n" | |||
| "smlabb %0, %5, %6, %0 \n" | |||
| T("itttt cs \n") | |||
| "rev16cs %2, %2 \n" | |||
| "orrcs %1, %1, %2, lsl %3 \n" | |||
| T("lslcs %2, %2, %3 \n") | |||
| T("orrcs %1, %1, %2 \n") | |||
| A("orrcs %1, %1, %2, lsl %3 \n") | |||
| "subcs %3, %3, #16 \n" | |||
| "lsr %0, %0, #8 \n" | |||
| "cmp %1, %0, lsl #16 \n" | |||
| "ittte ge \n" | |||
| "subge %1, %1, %0, lsl #16 \n" | |||
| "subge %0, %5, %0 \n" | |||
| "movge %2, #1 \n" | |||
| @@ -64,12 +78,17 @@ static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr) | |||
| unsigned tmp; | |||
| __asm__ ("adds %3, %3, %0 \n" | |||
| "itt cs \n" | |||
| "cmpcs %7, %4 \n" | |||
| "ldrcsh %2, [%4], #2 \n" | |||
| A("ldrcsh %2, [%4], #2 \n") | |||
| T("ldrhcs %2, [%4], #2 \n") | |||
| "rsb %0, %6, #256 \n" | |||
| "smlabb %0, %5, %6, %0 \n" | |||
| T("itttt cs \n") | |||
| "rev16cs %2, %2 \n" | |||
| "orrcs %1, %1, %2, lsl %3 \n" | |||
| T("lslcs %2, %2, %3 \n") | |||
| T("orrcs %1, %1, %2 \n") | |||
| A("orrcs %1, %1, %2, lsl %3 \n") | |||
| "subcs %3, %3, #16 \n" | |||
| "lsr %0, %0, #8 \n" | |||
| "lsl %2, %0, #16 \n" | |||
| @@ -25,13 +25,18 @@ | |||
| lsl \cw, \cw, \t0 | |||
| lsl \t0, \h, \t0 | |||
| rsb \h, \pr, #256 | |||
| it cs | |||
| ldrhcs \t1, [\buf], #2 | |||
| smlabb \h, \t0, \pr, \h | |||
| T itttt cs | |||
| rev16cs \t1, \t1 | |||
| orrcs \cw, \cw, \t1, lsl \bs | |||
| A orrcs \cw, \cw, \t1, lsl \bs | |||
| T lslcs \t1, \t1, \bs | |||
| T orrcs \cw, \cw, \t1 | |||
| subcs \bs, \bs, #16 | |||
| lsr \h, \h, #8 | |||
| cmp \cw, \h, lsl #16 | |||
| itt ge | |||
| subge \cw, \cw, \h, lsl #16 | |||
| subge \h, \t0, \h | |||
| .endm | |||
| @@ -40,14 +45,20 @@ | |||
| adds \bs, \bs, \t0 | |||
| lsl \cw, \cw, \t0 | |||
| lsl \t0, \h, \t0 | |||
| it cs | |||
| ldrhcs \t1, [\buf], #2 | |||
| mov \h, #128 | |||
| it cs | |||
| rev16cs \t1, \t1 | |||
| add \h, \h, \t0, lsl #7 | |||
| orrcs \cw, \cw, \t1, lsl \bs | |||
| A orrcs \cw, \cw, \t1, lsl \bs | |||
| T ittt cs | |||
| T lslcs \t1, \t1, \bs | |||
| T orrcs \cw, \cw, \t1 | |||
| subcs \bs, \bs, #16 | |||
| lsr \h, \h, #8 | |||
| cmp \cw, \h, lsl #16 | |||
| itt ge | |||
| subge \cw, \cw, \h, lsl #16 | |||
| subge \h, \t0, \h | |||
| .endm | |||
| @@ -59,6 +70,7 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
| cmp r3, #0 | |||
| ldr r11, [r5] | |||
| ldm r0, {r5-r7} @ high, bits, buf | |||
| it ne | |||
| pkhtbne r11, r11, r11, asr #16 | |||
| ldr r8, [r0, #16] @ code_word | |||
| 0: | |||
| @@ -80,19 +92,26 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
| adds r6, r6, r9 | |||
| add r4, r4, #11 | |||
| lsl r8, r8, r9 | |||
| it cs | |||
| ldrhcs r10, [r7], #2 | |||
| lsl r9, r5, r9 | |||
| mov r5, #128 | |||
| it cs | |||
| rev16cs r10, r10 | |||
| add r5, r5, r9, lsl #7 | |||
| orrcs r8, r8, r10, lsl r6 | |||
| T ittt cs | |||
| T lslcs r10, r10, r6 | |||
| T orrcs r8, r8, r10 | |||
| A orrcs r8, r8, r10, lsl r6 | |||
| subcs r6, r6, #16 | |||
| lsr r5, r5, #8 | |||
| cmp r8, r5, lsl #16 | |||
| movrel r10, zigzag_scan-1 | |||
| itt ge | |||
| subge r8, r8, r5, lsl #16 | |||
| subge r5, r9, r5 | |||
| ldrb r10, [r10, r3] | |||
| it ge | |||
| rsbge r12, r12, #0 | |||
| cmp r3, #16 | |||
| strh r12, [r1, r10] | |||
| @@ -108,6 +127,7 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
| ldr r0, [sp] | |||
| ldr r9, [r0, #12] | |||
| cmp r7, r9 | |||
| it hi | |||
| movhi r7, r9 | |||
| stm r0, {r5-r7} @ high, bits, buf | |||
| str r8, [r0, #16] @ code_word | |||
| @@ -131,11 +151,13 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
| mov r12, #2 | |||
| ldrb r0, [r4, #4] | |||
| rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
| it ge | |||
| addge r12, #1 | |||
| ldrb r9, [lr, r5] | |||
| blt 4f | |||
| ldrb r0, [r4, #5] | |||
| rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
| it ge | |||
| addge r12, #1 | |||
| ldrb r9, [lr, r5] | |||
| b 4f | |||
| @@ -153,6 +175,7 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
| mov r12, #5 | |||
| mov r0, #159 | |||
| rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
| it ge | |||
| addge r12, r12, #1 | |||
| ldrb r9, [lr, r5] | |||
| b 4f | |||
| @@ -160,23 +183,28 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
| mov r12, #7 | |||
| mov r0, #165 | |||
| rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
| it ge | |||
| addge r12, r12, #2 | |||
| ldrb r9, [lr, r5] | |||
| mov r0, #145 | |||
| rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
| it ge | |||
| addge r12, r12, #1 | |||
| ldrb r9, [lr, r5] | |||
| b 4f | |||
| 3: | |||
| ldrb r0, [r4, #8] | |||
| rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
| it ge | |||
| addge r4, r4, #1 | |||
| ldrb r9, [lr, r5] | |||
| ite ge | |||
| movge r12, #2 | |||
| movlt r12, #0 | |||
| ldrb r0, [r4, #9] | |||
| rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
| mov r9, #8 | |||
| it ge | |||
| addge r12, r12, #1 | |||
| movrel r4, X(ff_vp8_dct_cat_prob) | |||
| lsl r9, r9, r12 | |||
| @@ -189,6 +217,7 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
| lsl r1, r1, #1 | |||
| rac_get_prob r5, r6, r7, r8, r0, r9, r10 | |||
| ldrb r0, [r4], #1 | |||
| it ge | |||
| addge r1, r1, #1 | |||
| cmp r0, #0 | |||
| bne 1b | |||
| @@ -200,6 +229,7 @@ function ff_decode_block_coeffs_armv6, export=1 | |||
| add r4, r2, r4 | |||
| add r4, r4, #22 | |||
| rac_get_128 r5, r6, r7, r8, r9, r10 | |||
| it ge | |||
| rsbge r12, r12, #0 | |||
| smulbb r12, r12, r11 | |||
| movrel r9, zigzag_scan-1 | |||
| @@ -746,14 +746,14 @@ function ff_put_vp8_pixels4_neon, export=1 | |||
| push {r4-r6,lr} | |||
| 1: | |||
| subs r12, r12, #4 | |||
| ldr r4, [r2], r3 | |||
| ldr r5, [r2], r3 | |||
| ldr r6, [r2], r3 | |||
| ldr lr, [r2], r3 | |||
| str r4, [r0], r1 | |||
| str r5, [r0], r1 | |||
| str r6, [r0], r1 | |||
| str lr, [r0], r1 | |||
| ldr_post r4, r2, r3 | |||
| ldr_post r5, r2, r3 | |||
| ldr_post r6, r2, r3 | |||
| ldr_post lr, r2, r3 | |||
| str_post r4, r0, r1 | |||
| str_post r5, r0, r1 | |||
| str_post r6, r0, r1 | |||
| str_post lr, r0, r1 | |||
| bgt 1b | |||
| pop {r4-r6,pc} | |||
| endfunc | |||
| @@ -36,6 +36,7 @@ static av_always_inline av_const int FASTDIV(int a, int b) | |||
| int r; | |||
| __asm__ ("cmp %2, #2 \n\t" | |||
| "ldr %0, [%3, %2, lsl #2] \n\t" | |||
| "ite le \n\t" | |||
| "lsrle %0, %1, #1 \n\t" | |||
| "smmulgt %0, %0, %1 \n\t" | |||
| : "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc"); | |||
| @@ -101,6 +102,7 @@ static av_always_inline av_const int32_t av_clipl_int32_arm(int64_t a) | |||
| { | |||
| int x, y; | |||
| __asm__ ("adds %1, %R2, %Q2, lsr #31 \n\t" | |||
| "itet ne \n\t" | |||
| "mvnne %1, #1<<31 \n\t" | |||
| "moveq %0, %Q2 \n\t" | |||
| "eorne %0, %1, %R2, asr #31 \n\t" | |||