Signed-off-by: Mans Rullgard <mans@mansr.com>
@@ -967,6 +967,7 @@ CONFIG_LIST="
     static
     swscale
     swscale_alpha
+    thumb
     vaapi
     vdpau
     version3
@@ -2607,7 +2608,7 @@ if enabled alpha; then
 elif enabled arm; then
-    check_cflags -marm
+    enabled thumb && check_cflags -mthumb || check_cflags -marm
     nogas=die
     if check_cpp_condition stddef.h "defined __ARM_PCS_VFP"; then
@@ -114,12 +114,15 @@ static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
         "vmov  d1,  %2,  %3        \n\t"
         "lsls  %6,  %6,  #1        \n\t"
         "and   %0,  %5,  #1<<31    \n\t"
+        "it    cs                  \n\t"
         "lslcs %5,  %5,  #1        \n\t"
         "lsls  %6,  %6,  #1        \n\t"
         "and   %1,  %5,  #1<<31    \n\t"
+        "it    cs                  \n\t"
         "lslcs %5,  %5,  #1        \n\t"
         "lsls  %6,  %6,  #1        \n\t"
         "and   %2,  %5,  #1<<31    \n\t"
+        "it    cs                  \n\t"
         "lslcs %5,  %5,  #1        \n\t"
         "vmov  d4,  %0,  %1        \n\t"
         "and   %3,  %5,  #1<<31    \n\t"
@@ -27,6 +27,7 @@ function ff_ac3_update_bap_counts_arm, export=1
         lsl r3, lr, #1
         ldrh r12, [r0, r3]
         subs r2, r2, #1
+        it gt
         ldrbgt lr, [r1], #1
         add r12, r12, #1
         strh r12, [r0, r3]
@@ -42,9 +42,11 @@ function ff_ac3_bit_alloc_calc_bap_armv6, export=1
         mov r11, r10
         ldrb r10, [r4], #1 @ band_start_tab[band++]
         subs r9, r9, r5 @ - floor
+        it lt
         movlt r9, #0
         cmp r10, r3 @ - end
         and r9, r9, r8 @ & 0x1fe0
+        ite gt
         subgt r8, r3, r11
         suble r8, r10, r11
         add r9, r9, r5 @ + floor => m
@@ -41,6 +41,7 @@ endfunc
 function ff_ac3_exponent_min_neon, export=1
         cmp r1, #0
+        it eq
         bxeq lr
         push {lr}
         mov r12, #256
@@ -24,9 +24,18 @@
 # define ELF
 #else
 # define ELF @
+#endif
+
+#if CONFIG_THUMB
+# define A @
+# define T
+#else
+# define A
+# define T @
 #endif
 
         .syntax unified
+T       .thumb
 
 .macro require8 val=1
 ELF     .eabi_attribute 24, \val
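The A and T markers work by turning one of the two variants into a comment: in Thumb builds, A expands to `@` (the GNU as comment character on ARM targets), so A-prefixed lines disappear and T-prefixed lines assemble; ARM builds reverse the roles. A hedged illustration of a dual-mode sequence (instruction choice is illustrative only):

        A       ldr     r0, [r1], r2    @ assembled only when CONFIG_THUMB is 0
        T       ldr     r0, [r1]        @ assembled only when CONFIG_THUMB is 1
        T       add     r1, r1, r2      @ assembled only when CONFIG_THUMB is 1

Both paths load from the address in r1 and then advance r1 by r2; only the encodings differ.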
@@ -82,6 +91,90 @@ ELF .size \name, . - \name
 #endif
 .endm
 
+.macro ldr_pre rt, rn, rm:vararg
+A       ldr \rt, [\rn, \rm]!
+T       add \rn, \rn, \rm
+T       ldr \rt, [\rn]
+.endm
+
+.macro ldr_post rt, rn, rm:vararg
+A       ldr \rt, [\rn], \rm
+T       ldr \rt, [\rn]
+T       add \rn, \rn, \rm
+.endm
+
+.macro ldrd_reg rt, rt2, rn, rm
+A       ldrd \rt, \rt2, [\rn, \rm]
+T       add \rt, \rn, \rm
+T       ldrd \rt, \rt2, [\rt]
+.endm
+
+.macro ldrd_post rt, rt2, rn, rm
+A       ldrd \rt, \rt2, [\rn], \rm
+T       ldrd \rt, \rt2, [\rn]
+T       add \rn, \rn, \rm
+.endm
+
+.macro ldrh_pre rt, rn, rm
+A       ldrh \rt, [\rn, \rm]!
+T       add \rn, \rn, \rm
+T       ldrh \rt, [\rn]
+.endm
+
+.macro ldrh_dpre rt, rn, rm
+A       ldrh \rt, [\rn, -\rm]!
+T       sub \rn, \rn, \rm
+T       ldrh \rt, [\rn]
+.endm
+
+.macro ldrh_post rt, rn, rm
+A       ldrh \rt, [\rn], \rm
+T       ldrh \rt, [\rn]
+T       add \rn, \rn, \rm
+.endm
+
+.macro str_post rt, rn, rm:vararg
+A       str \rt, [\rn], \rm
+T       str \rt, [\rn]
+T       add \rn, \rn, \rm
+.endm
+
+.macro strb_post rt, rn, rm:vararg
+A       strb \rt, [\rn], \rm
+T       strb \rt, [\rn]
+T       add \rn, \rn, \rm
+.endm
+
+.macro strd_post rt, rt2, rn, rm
+A       strd \rt, \rt2, [\rn], \rm
+T       strd \rt, \rt2, [\rn]
+T       add \rn, \rn, \rm
+.endm
+
+.macro strh_pre rt, rn, rm
+A       strh \rt, [\rn, \rm]!
+T       add \rn, \rn, \rm
+T       strh \rt, [\rn]
+.endm
+
+.macro strh_dpre rt, rn, rm
+A       strh \rt, [\rn, -\rm]!
+T       sub \rn, \rn, \rm
+T       strh \rt, [\rn]
+.endm
+
+.macro strh_post rt, rn, rm
+A       strh \rt, [\rn], \rm
+T       strh \rt, [\rn]
+T       add \rn, \rn, \rm
+.endm
+
+.macro strh_dpost rt, rn, rm
+A       strh \rt, [\rn], -\rm
+T       strh \rt, [\rn]
+T       sub \rn, \rn, \rm
+.endm
+
 #if HAVE_VFP_ARGS
         .eabi_attribute 28, 1
 # define VFP
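These wrappers are needed because Thumb-2 cannot encode load/store writeback with a register offset (and its LDRD/STRD accept only immediate offsets), so ARM idioms like `ldr r4, [r1], r2` have no direct Thumb equivalent. Each macro emits the single ARM instruction in ARM builds and an equivalent two-instruction sequence in Thumb builds, at the cost of one extra add or sub. Usage sketch (registers arbitrary):

        @ Load *r1 into r4, then advance r1 by the stride in r2.
        @ ARM build:   ldr r4, [r1], r2
        @ Thumb build: ldr r4, [r1]  followed by  add r1, r1, r2
        ldr_post r4, r1, r2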
@@ -27,6 +27,7 @@ function ff_dca_lfe_fir_neon, export=1
         add r5, r2, #256*4-16 @ cf1
         sub r1, r1, #12
         cmp r3, #32
+        ite eq
         moveq r6, #256/32
         movne r6, #256/64
 NOVFP   vldr s0, [sp, #16] @ scale
@@ -554,10 +554,12 @@ endfunc
         and r9, r5, r14
         and r10, r6, r14
         and r11, r7, r14
+        it eq
         andeq r14, r14, r14, \rnd #1
         add r8, r8, r10
         add r9, r9, r11
         ldr r12, =0xfcfcfcfc >> 2
+        itt eq
         addeq r8, r8, r14
         addeq r9, r9, r14
         and r4, r12, r4, lsr #2
@@ -638,8 +640,10 @@ function ff_add_pixels_clamped_arm, export=1
         mvn r5, r5
         mvn r7, r7
         tst r6, #0x100
+        it ne
         movne r6, r5, lsr #24
         tst r8, #0x100
+        it ne
         movne r8, r7, lsr #24
         mov r9, r6
         ldrsh r5, [r0, #4] /* moved form [A] */
@@ -654,8 +658,10 @@ function ff_add_pixels_clamped_arm, export=1
         mvn r5, r5
         mvn r7, r7
         tst r6, #0x100
+        it ne
         movne r6, r5, lsr #24
         tst r8, #0x100
+        it ne
         movne r8, r7, lsr #24
         orr r9, r9, r6, lsl #16
         ldr r4, [r1, #4] /* moved form [B] */
@@ -676,8 +682,10 @@ function ff_add_pixels_clamped_arm, export=1
         mvn r5, r5
         mvn r7, r7
         tst r6, #0x100
+        it ne
         movne r6, r5, lsr #24
         tst r8, #0x100
+        it ne
         movne r8, r7, lsr #24
         mov r9, r6
         ldrsh r5, [r0, #12] /* moved from [D] */
@@ -692,8 +700,10 @@ function ff_add_pixels_clamped_arm, export=1
         mvn r5, r5
         mvn r7, r7
         tst r6, #0x100
+        it ne
         movne r6, r5, lsr #24
         tst r8, #0x100
+        it ne
         movne r8, r7, lsr #24
         orr r9, r9, r6, lsl #16
         add r0, r0, #16 /* moved from [E] */
@@ -47,16 +47,16 @@ function ff_put_pixels16_armv6, export=1
         ldr r5, [r1, #4]
         ldr r6, [r1, #8]
         ldr r7, [r1, #12]
-        ldr r4, [r1], r2
+        ldr_post r4, r1, r2
         strd r6, r7, [r0, #8]
         ldr r9, [r1, #4]
-        strd r4, r5, [r0], r2
+        strd_post r4, r5, r0, r2
         ldr r10, [r1, #8]
         ldr r11, [r1, #12]
-        ldr r8, [r1], r2
+        ldr_post r8, r1, r2
         strd r10, r11, [r0, #8]
         subs r3, r3, #2
-        strd r8, r9, [r0], r2
+        strd_post r8, r9, r0, r2
         bne 1b
         pop {r4-r11}
@@ -67,12 +67,12 @@ function ff_put_pixels8_armv6, export=1
         push {r4-r7}
 1:
         ldr r5, [r1, #4]
-        ldr r4, [r1], r2
+        ldr_post r4, r1, r2
         ldr r7, [r1, #4]
-        strd r4, r5, [r0], r2
-        ldr r6, [r1], r2
+        strd_post r4, r5, r0, r2
+        ldr_post r6, r1, r2
         subs r3, r3, #2
-        strd r6, r7, [r0], r2
+        strd_post r6, r7, r0, r2
         bne 1b
         pop {r4-r7}
@@ -90,7 +90,7 @@ function ff_put_pixels8_x2_armv6, export=1
         ldr r5, [r1, #4]
         ldr r7, [r1, #5]
         lsr r6, r4, #8
-        ldr r8, [r1, r2]!
+        ldr_pre r8, r1, r2
         orr r6, r6, r5, lsl #24
         ldr r9, [r1, #4]
         ldr r11, [r1, #5]
@@ -112,9 +112,9 @@ function ff_put_pixels8_x2_armv6, export=1
         uhadd8 r9, r9, r11
         and r6, r6, r12
         uadd8 r8, r8, r14
-        strd r4, r5, [r0], r2
+        strd_post r4, r5, r0, r2
         uadd8 r9, r9, r6
-        strd r8, r9, [r0], r2
+        strd_post r8, r9, r0, r2
         bne 1b
         pop {r4-r11, pc}
@@ -127,7 +127,7 @@ function ff_put_pixels8_y2_armv6, export=1
         orr r12, r12, r12, lsl #16
         ldr r4, [r1]
         ldr r5, [r1, #4]
-        ldr r6, [r1, r2]!
+        ldr_pre r6, r1, r2
         ldr r7, [r1, #4]
 1:
         subs r3, r3, #2
@@ -136,7 +136,7 @@ function ff_put_pixels8_y2_armv6, export=1
         uhadd8 r9, r5, r7
         eor r11, r5, r7
         and r10, r10, r12
-        ldr r4, [r1, r2]!
+        ldr_pre r4, r1, r2
         uadd8 r8, r8, r10
         and r11, r11, r12
         uadd8 r9, r9, r11
@@ -148,11 +148,11 @@ function ff_put_pixels8_y2_armv6, export=1
         eor r7, r5, r7
         uadd8 r10, r10, r6
         and r7, r7, r12
-        ldr r6, [r1, r2]!
+        ldr_pre r6, r1, r2
         uadd8 r11, r11, r7
-        strd r8, r9, [r0], r2
+        strd_post r8, r9, r0, r2
         ldr r7, [r1, #4]
-        strd r10, r11, [r0], r2
+        strd_post r10, r11, r0, r2
         bne 1b
         pop {r4-r11}
@@ -166,7 +166,7 @@ function ff_put_pixels8_x2_no_rnd_armv6, export=1
         ldr r4, [r1]
         ldr r5, [r1, #4]
         ldr r7, [r1, #5]
-        ldr r8, [r1, r2]!
+        ldr_pre r8, r1, r2
         ldr r9, [r1, #4]
         ldr r14, [r1, #5]
         add r1, r1, r2
@@ -191,16 +191,16 @@ function ff_put_pixels8_y2_no_rnd_armv6, export=1
         push {r4-r9, lr}
         ldr r4, [r1]
         ldr r5, [r1, #4]
-        ldr r6, [r1, r2]!
+        ldr_pre r6, r1, r2
         ldr r7, [r1, #4]
 1:
         subs r3, r3, #2
         uhadd8 r8, r4, r6
-        ldr r4, [r1, r2]!
+        ldr_pre r4, r1, r2
         uhadd8 r9, r5, r7
         ldr r5, [r1, #4]
         uhadd8 r12, r4, r6
-        ldr r6, [r1, r2]!
+        ldr_pre r6, r1, r2
         uhadd8 r14, r5, r7
         ldr r7, [r1, #4]
         stm r0, {r8,r9}
@@ -220,44 +220,44 @@ function ff_avg_pixels8_armv6, export=1
         orr lr, lr, lr, lsl #16
         ldrd r4, r5, [r0]
         ldr r10, [r1, #4]
-        ldr r9, [r1], r2
+        ldr_post r9, r1, r2
         subs r3, r3, #2
 1:
         pld [r1, r2]
         eor r8, r4, r9
         uhadd8 r4, r4, r9
         eor r12, r5, r10
-        ldrd r6, r7, [r0, r2]
+        ldrd_reg r6, r7, r0, r2
         uhadd8 r5, r5, r10
         and r8, r8, lr
         ldr r10, [r1, #4]
         and r12, r12, lr
         uadd8 r4, r4, r8
-        ldr r9, [r1], r2
+        ldr_post r9, r1, r2
         eor r8, r6, r9
         uadd8 r5, r5, r12
         pld [r1, r2, lsl #1]
         eor r12, r7, r10
         uhadd8 r6, r6, r9
-        strd r4, r5, [r0], r2
+        strd_post r4, r5, r0, r2
         uhadd8 r7, r7, r10
         beq 2f
         and r8, r8, lr
-        ldrd r4, r5, [r0, r2]
+        ldrd_reg r4, r5, r0, r2
         uadd8 r6, r6, r8
         ldr r10, [r1, #4]
         and r12, r12, lr
         subs r3, r3, #2
         uadd8 r7, r7, r12
-        ldr r9, [r1], r2
-        strd r6, r7, [r0], r2
+        ldr_post r9, r1, r2
+        strd_post r6, r7, r0, r2
         b 1b
 2:
         and r8, r8, lr
         and r12, r12, lr
         uadd8 r6, r6, r8
         uadd8 r7, r7, r12
-        strd r6, r7, [r0], r2
+        strd_post r6, r7, r0, r2
         pop {r4-r10, pc}
 endfunc
@@ -284,7 +284,7 @@ function ff_add_pixels_clamped_armv6, export=1
         orr r6, r8, r5, lsl #8
         orr r7, r4, lr, lsl #8
         subs r3, r3, #1
-        strd r6, r7, [r1], r2
+        strd_post r6, r7, r1, r2
         bgt 1b
         pop {r4-r8,pc}
 endfunc
@@ -294,7 +294,7 @@ function ff_get_pixels_armv6, export=1
         push {r4-r8, lr}
         mov lr, #8
 1:
-        ldrd r4, r5, [r1], r2
+        ldrd_post r4, r5, r1, r2
         subs lr, lr, #1
         uxtb16 r6, r4
         uxtb16 r4, r4, ror #8
@@ -317,8 +317,8 @@ function ff_diff_pixels_armv6, export=1
         push {r4-r9, lr}
         mov lr, #8
 1:
-        ldrd r4, r5, [r1], r3
-        ldrd r6, r7, [r2], r3
+        ldrd_post r4, r5, r1, r3
+        ldrd_post r6, r7, r2, r3
         uxtb16 r8, r4
         uxtb16 r4, r4, ror #8
         uxtb16 r9, r6
@@ -492,19 +492,19 @@ function ff_pix_abs8_armv6, export=1
         push {r4-r9, lr}
         mov r0, #0
         mov lr, #0
-        ldrd r4, r5, [r1], r3
+        ldrd_post r4, r5, r1, r3
 1:
         subs r12, r12, #2
         ldr r7, [r2, #4]
-        ldr r6, [r2], r3
-        ldrd r8, r9, [r1], r3
+        ldr_post r6, r2, r3
+        ldrd_post r8, r9, r1, r3
         usada8 r0, r4, r6, r0
         pld [r2, r3]
         usada8 lr, r5, r7, lr
         ldr r7, [r2, #4]
-        ldr r6, [r2], r3
+        ldr_post r6, r2, r3
         beq 2f
-        ldrd r4, r5, [r1], r3
+        ldrd_post r4, r5, r1, r3
         usada8 r0, r8, r6, r0
         pld [r2, r3]
         usada8 lr, r9, r7, lr
@@ -613,7 +613,7 @@ function ff_pix_sum_armv6, export=1
         ldr r7, [r0, #12]
         usada8 r2, r6, lr, r2
         beq 2f
-        ldr r4, [r0, r1]!
+        ldr_pre r4, r0, r1
         usada8 r3, r7, lr, r3
         bgt 1b
 2:
@@ -531,6 +531,7 @@ function ff_vorbis_inverse_coupling_neon, export=1
 2:      vst1.32 {d2-d3}, [r3, :128]!
         vst1.32 {d0-d1}, [r12,:128]!
+        it lt
         bxlt lr
 3:      vld1.32 {d2-d3}, [r1,:128]
@@ -575,6 +576,7 @@ NOVFP   vdup.32 q8, r2
 2:      vst1.32 {q2},[r0,:128]!
         vst1.32 {q3},[r0,:128]!
         ands len, len, #15
+        it eq
         bxeq lr
 3:      vld1.32 {q0},[r1,:128]!
         vmul.f32 q0, q0, q8
@@ -638,6 +640,7 @@ NOVFP   ldr r3, [sp]
 2:      vst1.32 {q8},[r0,:128]!
         vst1.32 {q9},[r0,:128]!
         ands r3, r3, #7
+        it eq
         popeq {pc}
 3:      vld1.32 {q0},[r1,:128]!
         ldr r12, [r2], #4
@@ -55,18 +55,23 @@ function ff_vector_fmul_vfp, export=1
 1:
         subs r3, r3, #16
         vmul.f32 s12, s4, s12
+        itttt ge
         vldmiage r1!, {s16-s19}
         vldmiage r2!, {s24-s27}
         vldmiage r1!, {s20-s23}
         vldmiage r2!, {s28-s31}
+        it ge
         vmulge.f32 s24, s16, s24
         vstmia r0!, {s8-s11}
         vstmia r0!, {s12-s15}
+        it ge
         vmulge.f32 s28, s20, s28
+        itttt gt
         vldmiagt r1!, {s0-s3}
         vldmiagt r2!, {s8-s11}
         vldmiagt r1!, {s4-s7}
         vldmiagt r2!, {s12-s15}
+        ittt ge
         vmulge.f32 s8, s0, s8
         vstmiage r0!, {s24-s27}
         vstmiage r0!, {s28-s31}
@@ -97,33 +102,49 @@ function ff_vector_fmul_reverse_vfp, export=1
         vmul.f32 s11, s0, s11
 1:
         subs r3, r3, #16
+        it ge
         vldmdbge r2!, {s16-s19}
         vmul.f32 s12, s7, s12
+        it ge
         vldmiage r1!, {s24-s27}
         vmul.f32 s13, s6, s13
+        it ge
         vldmdbge r2!, {s20-s23}
         vmul.f32 s14, s5, s14
+        it ge
         vldmiage r1!, {s28-s31}
         vmul.f32 s15, s4, s15
+        it ge
         vmulge.f32 s24, s19, s24
+        it gt
         vldmdbgt r2!, {s0-s3}
+        it ge
         vmulge.f32 s25, s18, s25
         vstmia r0!, {s8-s13}
+        it ge
         vmulge.f32 s26, s17, s26
+        it gt
         vldmiagt r1!, {s8-s11}
+        itt ge
         vmulge.f32 s27, s16, s27
         vmulge.f32 s28, s23, s28
+        it gt
         vldmdbgt r2!, {s4-s7}
+        it ge
         vmulge.f32 s29, s22, s29
         vstmia r0!, {s14-s15}
+        ittt ge
         vmulge.f32 s30, s21, s30
         vmulge.f32 s31, s20, s31
         vmulge.f32 s8, s3, s8
+        it gt
         vldmiagt r1!, {s12-s15}
+        itttt ge
         vmulge.f32 s9, s2, s9
         vmulge.f32 s10, s1, s10
         vstmiage r0!, {s24-s27}
         vmulge.f32 s11, s0, s11
+        it ge
         vstmiage r0!, {s28-s31}
         bgt 1b
@@ -71,6 +71,7 @@ endfunc
 function ff_float_to_int16_interleave_neon, export=1
         cmp r3, #2
+        itt lt
         ldrlt r1, [r1]
         blt ff_float_to_int16_neon
         bne 4f
@@ -196,6 +197,7 @@ function ff_float_to_int16_interleave_neon, export=1
         vst1.64 {d3}, [r8], ip
         vst1.64 {d7}, [r8], ip
         subs r3, r3, #4
+        it eq
         popeq {r4-r8,pc}
         cmp r3, #4
         add r0, r0, #8
@@ -305,6 +307,7 @@ function ff_float_to_int16_interleave_neon, export=1
         vst1.32 {d23[1]}, [r8], ip
 8:      subs r3, r3, #2
         add r0, r0, #4
+        it eq
         popeq {r4-r8,pc}
 @ 1 channel
@@ -354,6 +357,7 @@ function ff_float_to_int16_interleave_neon, export=1
         vst1.16 {d2[3]}, [r5,:16], ip
         vst1.16 {d3[1]}, [r5,:16], ip
         vst1.16 {d3[3]}, [r5,:16], ip
+        it eq
         popeq {r4-r8,pc}
         vld1.64 {d0-d1}, [r4,:128]!
         vcvt.s32.f32 q0, q0, #16
@@ -46,6 +46,7 @@ function ff_float_to_int16_vfp, export=1
         vmov r5, r6, s2, s3
         vmov r7, r8, s4, s5
         vmov ip, lr, s6, s7
+        it gt
         vldmiagt r1!, {s16-s23}
         ssat r4, #16, r4
         ssat r3, #16, r3
@@ -53,10 +54,12 @@ function ff_float_to_int16_vfp, export=1
         ssat r5, #16, r5
         pkhbt r3, r3, r4, lsl #16
         pkhbt r4, r5, r6, lsl #16
+        itttt gt
         vcvtgt.s32.f32 s0, s16
         vcvtgt.s32.f32 s1, s17
         vcvtgt.s32.f32 s2, s18
         vcvtgt.s32.f32 s3, s19
+        itttt gt
         vcvtgt.s32.f32 s4, s20
         vcvtgt.s32.f32 s5, s21
         vcvtgt.s32.f32 s6, s22
@@ -71,7 +71,9 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
         pld [r1]
         pld [r1, r2]
-        muls r7, r4, r5
+A       muls r7, r4, r5
+T       mul r7, r4, r5
+T       cmp r7, #0
         rsb r6, r7, r5, lsl #3
         rsb ip, r7, r4, lsl #3
         sub r4, r7, r4, lsl #3
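The `muls` split is needed because Thumb-2 has no flag-setting 32-bit multiply: MULS exists only as a 16-bit encoding whose destination must equal one of its source registers, which this three-register form does not satisfy. Thumb builds therefore multiply and compare separately. Equivalent sketch without the A/T markers:

        #if CONFIG_THUMB
                mul     r7, r4, r5      @ 32-bit Thumb-2 MUL cannot set flags
                cmp     r7, #0          @ recreate the N and Z flags muls would set
        #else
                muls    r7, r4, r5      @ ARM: multiply and set flags in one step
        #endif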
@@ -197,7 +199,9 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1
         pld [r1]
         pld [r1, r2]
-        muls r7, r4, r5
+A       muls r7, r4, r5
+T       mul r7, r4, r5
+T       cmp r7, #0
         rsb r6, r7, r5, lsl #3
         rsb ip, r7, r4, lsl #3
         sub r4, r7, r4, lsl #3
@@ -368,10 +372,10 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
         pop {r4-r6, pc}
 2:
 .ifc \type,put
-        ldrh r5, [r1], r2
-        strh r5, [r0], r2
-        ldrh r6, [r1], r2
-        strh r6, [r0], r2
+        ldrh_post r5, r1, r2
+        strh_post r5, r0, r2
+        ldrh_post r6, r1, r2
+        strh_post r6, r0, r2
 .else
         vld1.16 {d16[0]}, [r1], r2
         vld1.16 {d16[1]}, [r1], r2
@@ -404,28 +408,17 @@ endfunc
         ldr ip, [sp]
         tst r2, r2
         ldr ip, [ip]
+        it ne
         tstne r3, r3
         vmov.32 d24[0], ip
         and ip, ip, ip, lsl #16
+        it eq
         bxeq lr
         ands ip, ip, ip, lsl #8
+        it lt
         bxlt lr
 .endm
 
-.macro align_push_regs
-        and ip, sp, #15
-        add ip, ip, #32
-        sub sp, sp, ip
-        vst1.64 {d12-d15}, [sp,:128]
-        sub sp, sp, #32
-        vst1.64 {d8-d11}, [sp,:128]
-.endm
-
-.macro align_pop_regs
-        vld1.64 {d8-d11}, [sp,:128]!
-        vld1.64 {d12-d15}, [sp,:128], ip
-.endm
-
 .macro h264_loop_filter_luma
         vdup.8 q11, r2 @ alpha
         vmovl.u8 q12, d24
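Replacing align_push_regs/align_pop_regs with plain `vpush {d8-d15}`/`vpop {d8-d15}` (next hunks) appears to solve two problems at once: the old macros did sp arithmetic of a kind Thumb-2 restricts, and they existed only to guarantee the 16-byte stack alignment demanded by their `vst1 ... [sp,:128]` stores. vpush/vpop save the same callee-saved NEON registers with no alignment requirement:

        vpush   {d8-d15}        @ spill q4-q7; any sp alignment, ARM or Thumb
        @ ... code that clobbers q4-q7 ...
        vpop    {d8-d15}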
@@ -506,7 +499,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
         vld1.64 {d18,d19}, [r0,:128], r1
         vld1.64 {d16,d17}, [r0,:128], r1
-        align_push_regs
+        vpush {d8-d15}
         h264_loop_filter_luma
@@ -516,7 +509,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
         vst1.64 {d0, d1}, [r0,:128], r1
         vst1.64 {d10,d11}, [r0,:128]
-        align_pop_regs
+        vpop {d8-d15}
         bx lr
 endfunc
@@ -543,7 +536,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1
         transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
-        align_push_regs
+        vpush {d8-d15}
         h264_loop_filter_luma
@@ -568,7 +561,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1
         vst1.32 {d1[1]}, [r0], r1
         vst1.32 {d11[1]}, [r0], r1
-        align_pop_regs
+        vpop {d8-d15}
         bx lr
 endfunc
@@ -1116,6 +1109,7 @@ function \type\()_h264_qpel8_hv_lowpass_neon
         vrhadd.u8 d11, d11, d7
         sub r0, r0, r2, lsl #3
 .endif
         vst1.64 {d12}, [r0,:64], r2
         vst1.64 {d13}, [r0,:64], r2
         vst1.64 {d14}, [r0,:64], r2
@@ -1263,7 +1257,9 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1
 \type\()_h264_qpel8_mc11:
         lowpass_const r3
         mov r11, sp
-        bic sp, sp, #15
+A       bic sp, sp, #15
+T       bic r0, r11, #15
+T       mov sp, r0
         sub sp, sp, #64
         mov r0, sp
         sub r1, r1, #2
@@ -1271,14 +1267,14 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1
         mov ip, #8
         vpush {d8-d15}
         bl put_h264_qpel8_h_lowpass_neon
-        ldrd r0, [r11]
+        ldrd r0, [r11], #8
         mov r3, r2
         add ip, sp, #64
         sub r1, r1, r2, lsl #1
         mov r2, #8
         bl \type\()_h264_qpel8_v_lowpass_l2_neon
         vpop {d8-d15}
-        add sp, r11, #8
+        mov sp, r11
         pop {r11, pc}
 endfunc
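The prologue/epilogue rework here reflects another Thumb-2 restriction: sp may not be the destination of general data-processing instructions, so `bic sp, sp, #15` and `add sp, r11, #8` are ARM-only forms. The alignment is now done in a scratch register and moved into sp in Thumb builds, while the epilogue is rewritten for both modes: `ldrd r0, [r11], #8` reloads the two saved argument words and advances r11 past them, so a plain `mov sp, r11` restores the stack. Sketch of the alignment idiom:

        mov     r11, sp         @ keep the incoming sp for the epilogue
        bic     r0, r11, #15    @ do the alignment math in a scratch register
        mov     sp, r0          @ only mov (and sp-based add/sub) may write sp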
@@ -1287,7 +1283,9 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1
 \type\()_h264_qpel8_mc21:
         lowpass_const r3
         mov r11, sp
-        bic sp, sp, #15
+A       bic sp, sp, #15
+T       bic r0, r11, #15
+T       mov sp, r0
         sub sp, sp, #(8*8+16*12)
         sub r1, r1, #2
         mov r3, #8
@@ -1296,14 +1294,14 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1
         vpush {d8-d15}
         bl put_h264_qpel8_h_lowpass_neon
         mov r4, r0
-        ldrd r0, [r11]
+        ldrd r0, [r11], #8
         sub r1, r1, r2, lsl #1
         sub r1, r1, #2
         mov r3, r2
         sub r2, r4, #64
         bl \type\()_h264_qpel8_hv_lowpass_l2_neon
         vpop {d8-d15}
-        add sp, r11, #8
+        mov sp, r11
         pop {r4, r10, r11, pc}
 endfunc
@@ -1330,7 +1328,9 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1
 \type\()_h264_qpel8_mc12:
         lowpass_const r3
         mov r11, sp
-        bic sp, sp, #15
+A       bic sp, sp, #15
+T       bic r0, r11, #15
+T       mov sp, r0
         sub sp, sp, #(8*8+16*12)
         sub r1, r1, r2, lsl #1
         mov r3, r2
@@ -1339,20 +1339,22 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1
         vpush {d8-d15}
         bl put_h264_qpel8_v_lowpass_neon
         mov r4, r0
-        ldrd r0, [r11]
+        ldrd r0, [r11], #8
         sub r1, r1, r3, lsl #1
         sub r1, r1, #2
         sub r2, r4, #64
         bl \type\()_h264_qpel8_hv_lowpass_l2_neon
         vpop {d8-d15}
-        add sp, r11, #8
+        mov sp, r11
         pop {r4, r10, r11, pc}
 endfunc
 
 function ff_\type\()_h264_qpel8_mc22_neon, export=1
         push {r4, r10, r11, lr}
         mov r11, sp
-        bic sp, sp, #15
+A       bic sp, sp, #15
+T       bic r4, r11, #15
+T       mov sp, r4
         sub r1, r1, r2, lsl #1
         sub r1, r1, #2
         mov r3, r2
@@ -1441,21 +1443,23 @@ function ff_\type\()_h264_qpel16_mc11_neon, export=1
 \type\()_h264_qpel16_mc11:
         lowpass_const r3
         mov r11, sp
-        bic sp, sp, #15
+A       bic sp, sp, #15
+T       bic r0, r11, #15
+T       mov sp, r0
         sub sp, sp, #256
         mov r0, sp
         sub r1, r1, #2
         mov r3, #16
         vpush {d8-d15}
         bl put_h264_qpel16_h_lowpass_neon
-        ldrd r0, [r11]
+        ldrd r0, [r11], #8
         mov r3, r2
         add ip, sp, #64
         sub r1, r1, r2, lsl #1
         mov r2, #16
         bl \type\()_h264_qpel16_v_lowpass_l2_neon
         vpop {d8-d15}
-        add sp, r11, #8
+        mov sp, r11
         pop {r4, r11, pc}
 endfunc
@@ -1464,20 +1468,22 @@ function ff_\type\()_h264_qpel16_mc21_neon, export=1
 \type\()_h264_qpel16_mc21:
         lowpass_const r3
         mov r11, sp
-        bic sp, sp, #15
+A       bic sp, sp, #15
+T       bic r0, r11, #15
+T       mov sp, r0
         sub sp, sp, #(16*16+16*12)
         sub r1, r1, #2
         mov r0, sp
         vpush {d8-d15}
         bl put_h264_qpel16_h_lowpass_neon_packed
         mov r4, r0
-        ldrd r0, [r11]
+        ldrd r0, [r11], #8
         sub r1, r1, r2, lsl #1
         sub r1, r1, #2
         mov r3, r2
         bl \type\()_h264_qpel16_hv_lowpass_l2_neon
         vpop {d8-d15}
-        add sp, r11, #8
+        mov sp, r11
         pop {r4-r5, r9-r11, pc}
 endfunc
@@ -1504,7 +1510,9 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1
 \type\()_h264_qpel16_mc12:
         lowpass_const r3
         mov r11, sp
-        bic sp, sp, #15
+A       bic sp, sp, #15
+T       bic r0, r11, #15
+T       mov sp, r0
         sub sp, sp, #(16*16+16*12)
         sub r1, r1, r2, lsl #1
         mov r0, sp
@@ -1512,13 +1520,13 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1
         vpush {d8-d15}
         bl put_h264_qpel16_v_lowpass_neon_packed
         mov r4, r0
-        ldrd r0, [r11]
+        ldrd r0, [r11], #8
         sub r1, r1, r3, lsl #1
         sub r1, r1, #2
         mov r2, r3
         bl \type\()_h264_qpel16_hv_lowpass_l2_neon
         vpop {d8-d15}
-        add sp, r11, #8
+        mov sp, r11
         pop {r4-r5, r9-r11, pc}
 endfunc
@@ -1526,7 +1534,9 @@ function ff_\type\()_h264_qpel16_mc22_neon, export=1
         push {r4, r9-r11, lr}
         lowpass_const r3
         mov r11, sp
-        bic sp, sp, #15
+A       bic sp, sp, #15
+T       bic r4, r11, #15
+T       mov sp, r4
         sub r1, r1, r2, lsl #1
         sub r1, r1, #2
         mov r3, r2
@@ -106,10 +106,12 @@ function ff_h264_idct_add16_neon, export=1
         blt 2f
         ldrsh lr, [r1]
         add r0, r0, r4
+        it ne
         movne lr, #0
         cmp lr, #0
-        adrne lr, ff_h264_idct_dc_add_neon
-        adreq lr, ff_h264_idct_add_neon
+        ite ne
+        adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
+        adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB
         blx lr
 2:      subs ip, ip, #1
         add r1, r1, #32
@@ -132,8 +134,9 @@ function ff_h264_idct_add16intra_neon, export=1
         add r0, r0, r4
         cmp r8, #0
         ldrsh r8, [r1]
-        adrne lr, ff_h264_idct_add_neon
-        adreq lr, ff_h264_idct_dc_add_neon
+        iteet ne
+        adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
+        adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
         cmpeq r8, #0
         blxne lr
         subs ip, ip, #1
@@ -159,12 +162,14 @@ function ff_h264_idct_add8_neon, export=1
         add r1, r3, r12, lsl #5
         cmp r8, #0
         ldrsh r8, [r1]
-        adrne lr, ff_h264_idct_add_neon
-        adreq lr, ff_h264_idct_dc_add_neon
+        iteet ne
+        adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
+        adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
         cmpeq r8, #0
         blxne lr
         add r12, r12, #1
         cmp r12, #4
+        itt eq
         moveq r12, #16
         moveq r4, r9
         cmp r12, #20
@@ -365,10 +370,12 @@ function ff_h264_idct8_add4_neon, export=1
         blt 2f
         ldrsh lr, [r1]
         add r0, r0, r4
+        it ne
         movne lr, #0
         cmp lr, #0
-        adrne lr, ff_h264_idct8_dc_add_neon
-        adreq lr, ff_h264_idct8_add_neon
+        ite ne
+        adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB
+        adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB
         blx lr
 2:      subs r12, r12, #4
         add r1, r1, #128
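The `+ CONFIG_THUMB` on the `adr` operands handles interworking: `blx` to a register address switches instruction set according to bit 0 of that address, and `adr` yields an even code address. In Thumb builds CONFIG_THUMB is 1, so the +1 tags the handler as Thumb code and `blx lr` stays in Thumb state; in ARM builds it adds 0 and nothing changes. Sketch of the idiom (label names illustrative):

        @ Pick one of two handlers, then call through lr.
        ite     ne
        adrne   lr, dc_handler + CONFIG_THUMB   @ +1 in Thumb builds
        adreq   lr, full_handler + CONFIG_THUMB
        blx     lr                              @ bit 0 selects ARM/Thumb state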
@@ -64,11 +64,14 @@ static inline av_const int mid_pred(int a, int b, int c)
     __asm__ (
         "mov   %0, %2 \n\t"
         "cmp   %1, %2 \n\t"
+        "itt   gt     \n\t"
         "movgt %0, %1 \n\t"
         "movgt %1, %2 \n\t"
         "cmp   %1, %3 \n\t"
+        "it    le     \n\t"
         "movle %1, %3 \n\t"
         "cmp   %0, %1 \n\t"
+        "it    gt     \n\t"
         "movgt %0, %1 \n\t"
         : "=&r"(m), "+r"(a)
         : "r"(b), "r"(c)
@@ -191,7 +191,9 @@ function ff_mdct_calc_neon, export=1
         vadd.f32 d17, d17, d3 @ in2u+in1d -I
 1:
         vmul.f32 d7, d0, d21 @ I*s
-        ldr r10, [r3, lr, lsr #1]
+A       ldr r10, [r3, lr, lsr #1]
+T       lsr r10, lr, #1
+T       ldr r10, [r3, r10]
         vmul.f32 d6, d1, d20 @ -R*c
         ldr r6, [r3, #4]!
         vmul.f32 d4, d1, d21 @ -R*s
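Thumb-2 LDR with a register offset accepts only an optional `lsl #0-3`, so the ARM-only `lsr`-shifted index has to be computed into a spare register first. Equivalent form without the A/T markers:

        #if CONFIG_THUMB
                lsr     r10, lr, #1           @ Thumb LDR allows only lsl #0-3
                ldr     r10, [r3, r10]
        #else
                ldr     r10, [r3, lr, lsr #1] @ ARM: any shifted register offset
        #endif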
@@ -75,7 +75,7 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1
         sum8 r8, r9, r1, r0, r10, r11, r12, lr
         sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32
         round r10, r8, r9
-        strh r10, [r3], r4
+        strh_post r10, r3, r4
         mov lr, #15
 1:
@@ -127,10 +127,10 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1
         round r10, r8, r9
         adds r8, r8, r4
         adc r9, r9, r7
-        strh r10, [r3], r12
+        strh_post r10, r3, r12
         round r11, r8, r9
         subs lr, lr, #1
-        strh r11, [r5], -r12
+        strh_dpost r11, r5, r12
         bgt 1b
         sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33
@@ -38,15 +38,21 @@
 .macro dequant_t dst, src, mul, add, tmp
         rsbs \tmp, ip, \src, asr #16
+        it gt
         addgt \tmp, \add, #0
+        it lt
         rsblt \tmp, \add, #0
+        it ne
         smlatbne \dst, \src, \mul, \tmp
 .endm
 
 .macro dequant_b dst, src, mul, add, tmp
         rsbs \tmp, ip, \src, lsl #16
+        it gt
         addgt \tmp, \add, #0
+        it lt
         rsblt \tmp, \add, #0
+        it ne
         smlabbne \dst, \src, \mul, \tmp
 .endm
@@ -80,21 +86,27 @@ function ff_dct_unquantize_h263_armv5te, export=1
         strh lr, [r0], #2
         subs r3, r3, #8
+        it gt
         ldrdgt r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
         bgt 1b
         adds r3, r3, #2
+        it le
         pople {r4-r9,pc}
 2:
         ldrsh r9, [r0, #0]
         ldrsh lr, [r0, #2]
         mov r8, r2
         cmp r9, #0
+        it lt
         rsblt r8, r2, #0
+        it ne
         smlabbne r9, r9, r1, r8
         mov r8, r2
         cmp lr, #0
+        it lt
         rsblt r8, r2, #0
+        it ne
         smlabbne lr, lr, r1, r8
         strh r9, [r0], #2
         strh lr, [r0], #2
@@ -57,6 +57,7 @@ function ff_dct_unquantize_h263_neon, export=1
         subs r3, r3, #16
         vst1.16 {q0}, [r1,:128]!
         vst1.16 {q8}, [r1,:128]!
+        it le
         bxle lr
         cmp r3, #8
         bgt 1b
@@ -78,6 +79,7 @@ function ff_dct_unquantize_h263_intra_neon, export=1
         ldr r6, [r0, #AC_PRED]
         add lr, r0, #INTER_SCANTAB_RASTER_END
         cmp r6, #0
+        it ne
         movne r12, #63
         bne 1f
         ldr r12, [r12, r2, lsl #2]
@@ -86,9 +88,11 @@ function ff_dct_unquantize_h263_intra_neon, export=1
         ldrsh r4, [r1]
         cmp r5, #0
         mov r5, r1
+        it ne
         movne r2, #0
         bne 2f
         cmp r2, #4
+        it ge
         addge r0, r0, #4
         sub r2, r3, #1
         ldr r6, [r0, #Y_DC_SCALE]
@@ -137,6 +137,7 @@ function ff_rdft_calc_neon, export=1
         vst1.32 {d22}, [r5,:64]
         cmp r6, #0
+        it eq
         popeq {r4-r8,pc}
         vmul.f32 d22, d22, d18
@@ -121,11 +121,13 @@ __b_evaluation:
         ldr r11, [r12, #offW7] @ R11=W7
         mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
         mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
-        teq r2, #0 @ if null avoid muls
-        mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        teq r2, #0 @ if null avoid muls
+        itttt ne
+        mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
         rsbne r2, r2, #0 @ R2=-ROWr16[3]
         mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
         mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        it ne
         mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
@@ -148,19 +150,23 @@ __b_evaluation:
 @@ MAC16(b3, -W1, row[7]);
 @@ MAC16(b1, -W5, row[7]);
         mov r3, r3, asr #16 @ R3=ROWr16[5]
-        teq r3, #0 @ if null avoid muls
+        teq r3, #0 @ if null avoid muls
+        it ne
         mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
         mov r4, r4, asr #16 @ R4=ROWr16[7]
+        itttt ne
         mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
         mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
         rsbne r3, r3, #0 @ R3=-ROWr16[5]
         mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5]=b1
 @@ R3 is free now
-        teq r4, #0 @ if null avoid muls
+        teq r4, #0 @ if null avoid muls
+        itttt ne
         mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
         mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
         rsbne r4, r4, #0 @ R4=-ROWr16[7]
         mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
+        it ne
         mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
 @@ R4 is free now
 __end_b_evaluation:
@@ -204,16 +210,19 @@ __a_evaluation:
 @@ a2 -= W4*row[4]
 @@ a3 += W4*row[4]
         ldrsh r11, [r14, #8] @ R11=ROWr16[4]
-        teq r11, #0 @ if null avoid muls
+        teq r11, #0 @ if null avoid muls
+        it ne
         mulne r11, r9, r11 @ R11=W4*ROWr16[4]
 @@ R9 is free now
         ldrsh r9, [r14, #12] @ R9=ROWr16[6]
+        itttt ne
         addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
         subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
         subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
         addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
 @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
-        teq r9, #0 @ if null avoid muls
+        teq r9, #0 @ if null avoid muls
+        itttt ne
         mulne r11, r10, r9 @ R11=W6*ROWr16[6]
         addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
         mulne r10, r8, r9 @ R10=W2*ROWr16[6]
@@ -222,6 +231,7 @@ __a_evaluation:
 @@ a1 -= W2*row[6];
 @@ a2 += W2*row[6];
         subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
+        itt ne
         subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
         addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
@@ -323,10 +333,12 @@ __b_evaluation2:
         ldrsh r2, [r14, #48]
         mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
         teq r2, #0 @ if 0, then avoid muls
+        itttt ne
         mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
         rsbne r2, r2, #0 @ R2=-ROWr16[3]
         mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
         mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        it ne
         mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
@@ -342,18 +354,22 @@ __b_evaluation2:
 @@ MAC16(b1, -W5, col[7x8]);
         ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
         teq r3, #0 @ if 0 then avoid muls
+        itttt ne
         mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
         mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
         mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
         rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
         ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
+        it ne
         mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5x8]=b1
 @@ R3 is free now
         teq r4, #0 @ if 0 then avoid muls
+        itttt ne
         mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
         mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
         rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
         mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
+        it ne
         mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
 @@ R4 is free now
 __end_b_evaluation2:
@@ -390,15 +406,18 @@ __a_evaluation2:
 @@ a3 += W4*row[4]
         ldrsh r11, [r14, #64] @ R11=ROWr16[4]
         teq r11, #0 @ if null avoid muls
+        itttt ne
         mulne r11, r9, r11 @ R11=W4*ROWr16[4]
 @@ R9 is free now
         addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
         subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
         subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
         ldrsh r9, [r14, #96] @ R9=ROWr16[6]
+        it ne
         addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
 @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
         teq r9, #0 @ if null avoid muls
+        itttt ne
         mulne r11, r10, r9 @ R11=W6*ROWr16[6]
         addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
         mulne r10, r8, r9 @ R10=W2*ROWr16[6]
@@ -407,6 +426,7 @@ __a_evaluation2:
 @@ a1 -= W2*row[6];
 @@ a2 += W2*row[6];
         subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
+        itt ne
         subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
         addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
 __end_a_evaluation2:
@@ -49,6 +49,7 @@ function idct_row_armv5te
         ldrd v1, [a1, #8]
         ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */
         orrs v1, v1, v2
+        itt eq
         cmpeq v1, a4
         cmpeq v1, a3, lsr #16
         beq row_dc_only
@@ -269,6 +270,7 @@ function idct_col_armv5te
         ldmfd sp!, {a3, a4}
         adds a2, a3, v1
         mov a2, a2, lsr #20
+        it mi
         orrmi a2, a2, #0xf000
         add ip, a4, v2
         mov ip, ip, asr #20
@@ -276,6 +278,7 @@ function idct_col_armv5te
         str a2, [a1]
         subs a3, a3, v1
         mov a2, a3, lsr #20
+        it mi
         orrmi a2, a2, #0xf000
         sub a4, a4, v2
         mov a4, a4, asr #20
@@ -285,6 +288,7 @@ function idct_col_armv5te
         subs a2, a3, v3
         mov a2, a2, lsr #20
+        it mi
         orrmi a2, a2, #0xf000
         sub ip, a4, v4
         mov ip, ip, asr #20
@@ -292,6 +296,7 @@ function idct_col_armv5te
         str a2, [a1, #(16*1)]
         adds a3, a3, v3
         mov a2, a3, lsr #20
+        it mi
         orrmi a2, a2, #0xf000
         add a4, a4, v4
         mov a4, a4, asr #20
@@ -301,6 +306,7 @@ function idct_col_armv5te
         adds a2, a3, v5
         mov a2, a2, lsr #20
+        it mi
         orrmi a2, a2, #0xf000
         add ip, a4, v6
         mov ip, ip, asr #20
@@ -308,6 +314,7 @@ function idct_col_armv5te
         str a2, [a1, #(16*2)]
         subs a3, a3, v5
         mov a2, a3, lsr #20
+        it mi
         orrmi a2, a2, #0xf000
         sub a4, a4, v6
         mov a4, a4, asr #20
@@ -317,6 +324,7 @@ function idct_col_armv5te
         adds a2, a3, v7
         mov a2, a2, lsr #20
+        it mi
         orrmi a2, a2, #0xf000
         add ip, a4, fp
         mov ip, ip, asr #20
@@ -324,6 +332,7 @@ function idct_col_armv5te
         str a2, [a1, #(16*3)]
         subs a3, a3, v7
         mov a2, a3, lsr #20
+        it mi
         orrmi a2, a2, #0xf000
         sub a4, a4, fp
         mov a4, a4, asr #20
| @@ -335,15 +344,19 @@ endfunc | |||||
| .macro clip dst, src:vararg | .macro clip dst, src:vararg | ||||
| movs \dst, \src | movs \dst, \src | ||||
| it mi | |||||
| movmi \dst, #0 | movmi \dst, #0 | ||||
| cmp \dst, #255 | cmp \dst, #255 | ||||
| it gt | |||||
| movgt \dst, #255 | movgt \dst, #255 | ||||
| .endm | .endm | ||||
| .macro aclip dst, src:vararg | .macro aclip dst, src:vararg | ||||
| adds \dst, \src | adds \dst, \src | ||||
| it mi | |||||
| movmi \dst, #0 | movmi \dst, #0 | ||||
| cmp \dst, #255 | cmp \dst, #255 | ||||
| it gt | |||||
| movgt \dst, #255 | movgt \dst, #255 | ||||
| .endm | .endm | ||||
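
The clip/aclip macros need two separate one-instruction IT blocks: an IT
block has a single base condition (else slots use its inverse), MI and GT
are not inverses of each other, and the intervening unconditional "cmp"
could not sit inside an IT block anyway. After the patch,
"clip a2, a2, asr #20" therefore expands along these lines:

        movs    a2, a2, asr #20 @ shift, setting N from the result
        it      mi
        movmi   a2, #0          @ negative -> clamp to 0
        cmp     a2, #255
        it      gt
        movgt   a2, #255        @ above 255 -> clamp to 255
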
@@ -370,35 +383,35 @@ function idct_col_put_armv5te
         orr     a2, a3, a4, lsl #8
         rsb     v2, lr, lr, lsl #3
         ldmfd   sp!, {a3, a4}
-        strh    a2, [v2, v1]!
+        strh_pre a2, v2, v1
         sub     a2, a3, v3
         clip    a2, a2, asr #20
         sub     ip, a4, v4
         clip    ip, ip, asr #20
         orr     a2, a2, ip, lsl #8
-        strh    a2, [v1, lr]!
+        strh_pre a2, v1, lr
         add     a3, a3, v3
         clip    a2, a3, asr #20
         add     a4, a4, v4
         clip    a4, a4, asr #20
         orr     a2, a2, a4, lsl #8
         ldmfd   sp!, {a3, a4}
-        strh    a2, [v2, -lr]!
+        strh_dpre a2, v2, lr
         add     a2, a3, v5
         clip    a2, a2, asr #20
         add     ip, a4, v6
         clip    ip, ip, asr #20
         orr     a2, a2, ip, lsl #8
-        strh    a2, [v1, lr]!
+        strh_pre a2, v1, lr
         sub     a3, a3, v5
         clip    a2, a3, asr #20
         sub     a4, a4, v6
         clip    a4, a4, asr #20
         orr     a2, a2, a4, lsl #8
         ldmfd   sp!, {a3, a4}
-        strh    a2, [v2, -lr]!
+        strh_dpre a2, v2, lr
         add     a2, a3, v7
         clip    a2, a2, asr #20
@@ -411,7 +424,7 @@ function idct_col_put_armv5te
         sub     a4, a4, fp
         clip    a4, a4, asr #20
         orr     a2, a2, a4, lsl #8
-        strh    a2, [v2, -lr]
+        strh_dpre a2, v2, lr
         ldr     pc, [sp], #4
 endfunc
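
Thumb-2 drops two addressing modes used heavily in this file:
register-offset loads/stores with writeback ("[rn, rm]!") and negative
register offsets. The strh_pre/strh_dpre wrappers hide the difference;
their bodies are not shown in these hunks, but following the A/T helper
convention used elsewhere in the patch they presumably expand along
these lines (assumed body):

.macro  strh_dpre rt, rn, rm:vararg
A       strh    \rt, [\rn, -\rm]!       @ ARM: pre-decrement with writeback
T       sub     \rn, \rn, \rm           @ Thumb-2: step the base back first,
T       strh    \rt, [\rn]              @ then store at the new address
.endm
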
@@ -436,7 +449,7 @@ function idct_col_add_armv5te
         ldr     v1, [sp, #32]
         sub     a4, a4, v2
         rsb     v2, v1, v1, lsl #3
-        ldrh    ip, [v2, lr]!
+        ldrh_pre ip, v2, lr
         strh    a2, [lr]
         and     a2, ip, #255
         aclip   a3, a2, a3, asr #20
@@ -448,7 +461,7 @@ function idct_col_add_armv5te
         strh    a2, [v2]
         ldmfd   sp!, {a3, a4}
-        ldrh    ip, [lr, v1]!
+        ldrh_pre ip, lr, v1
         sub     a2, a3, v3
         add     a3, a3, v3
         and     v3, ip, #255
@@ -458,7 +471,7 @@ function idct_col_add_armv5te
         aclip   v3, v3, ip, lsr #8
         orr     a2, a2, v3, lsl #8
         add     a4, a4, v4
-        ldrh    ip, [v2, -v1]!
+        ldrh_dpre ip, v2, v1
         strh    a2, [lr]
         and     a2, ip, #255
         aclip   a3, a2, a3, asr #20
@@ -468,7 +481,7 @@ function idct_col_add_armv5te
         strh    a2, [v2]
         ldmfd   sp!, {a3, a4}
-        ldrh    ip, [lr, v1]!
+        ldrh_pre ip, lr, v1
         add     a2, a3, v5
         sub     a3, a3, v5
         and     v3, ip, #255
@@ -478,7 +491,7 @@ function idct_col_add_armv5te
         aclip   v3, v3, ip, lsr #8
         orr     a2, a2, v3, lsl #8
         sub     a4, a4, v6
-        ldrh    ip, [v2, -v1]!
+        ldrh_dpre ip, v2, v1
         strh    a2, [lr]
         and     a2, ip, #255
         aclip   a3, a2, a3, asr #20
@@ -488,7 +501,7 @@ function idct_col_add_armv5te
         strh    a2, [v2]
         ldmfd   sp!, {a3, a4}
-        ldrh    ip, [lr, v1]!
+        ldrh_pre ip, lr, v1
         add     a2, a3, v7
         sub     a3, a3, v7
         and     v3, ip, #255
@@ -498,7 +511,7 @@ function idct_col_add_armv5te
         aclip   v3, v3, ip, lsr #8
         orr     a2, a2, v3, lsl #8
         sub     a4, a4, fp
-        ldrh    ip, [v2, -v1]!
+        ldrh_dpre ip, v2, v1
         strh    a2, [lr]
         and     a2, ip, #255
         aclip   a3, a2, a3, asr #20
@@ -200,6 +200,7 @@ function idct_row_armv6
         ldr     r3, [r0, #8]            /* r3 = row[3,1] */
         ldr     r2, [r0]                /* r2 = row[2,0] */
         orrs    lr, lr, ip
+        itt     eq
         cmpeq   lr, r3
         cmpeq   lr, r2, lsr #16
         beq     1f
@@ -282,14 +283,14 @@ function idct_col_put_armv6
         pop     {r1, r2}
         idct_finish_shift_sat COL_SHIFT
-        strb    r4, [r1], r2
-        strb    r5, [r1], r2
-        strb    r6, [r1], r2
-        strb    r7, [r1], r2
-        strb    r11,[r1], r2
-        strb    r10,[r1], r2
-        strb    r9, [r1], r2
-        strb    r8, [r1], r2
+        strb_post r4, r1, r2
+        strb_post r5, r1, r2
+        strb_post r6, r1, r2
+        strb_post r7, r1, r2
+        strb_post r11,r1, r2
+        strb_post r10,r1, r2
+        strb_post r9, r1, r2
+        strb_post r8, r1, r2
         sub     r1, r1, r2, lsl #3
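
Post-indexed register forms ("[rn], rm") likewise have no Thumb-2
encoding, hence strb_post and friends. An assumed body in the same
style as the pre-indexed wrappers:

.macro  strb_post rt, rn, rm:vararg
A       strb    \rt, [\rn], \rm         @ ARM: store, then advance the base
T       strb    \rt, [\rn]              @ Thumb-2: store at the current
T       add     \rn, \rn, \rm           @ address, then advance separately
.endm
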
@@ -318,16 +319,16 @@ function idct_col_add_armv6
         add     ip, r3, ip, asr #COL_SHIFT
         usat    ip, #8, ip
         add     r4, r7, r4, asr #COL_SHIFT
-        strb    ip, [r1], r2
+        strb_post ip, r1, r2
         ldrb    ip, [r1, r2]
         usat    r4, #8, r4
         ldrb    r11,[r1, r2, lsl #2]
         add     r5, ip, r5, asr #COL_SHIFT
         usat    r5, #8, r5
-        strb    r4, [r1], r2
+        strb_post r4, r1, r2
         ldrb    r3, [r1, r2]
         ldrb    ip, [r1, r2, lsl #2]
-        strb    r5, [r1], r2
+        strb_post r5, r1, r2
         ldrb    r7, [r1, r2]
         ldrb    r4, [r1, r2, lsl #2]
         add     r6, r3, r6, asr #COL_SHIFT
@@ -340,11 +341,11 @@ function idct_col_add_armv6
         usat    r8, #8, r8
         add     lr, r4, lr, asr #COL_SHIFT
         usat    lr, #8, lr
-        strb    r6, [r1], r2
-        strb    r10,[r1], r2
-        strb    r9, [r1], r2
-        strb    r8, [r1], r2
-        strb    lr, [r1], r2
+        strb_post r6, r1, r2
+        strb_post r10,r1, r2
+        strb_post r9, r1, r2
+        strb_post r8, r1, r2
+        strb_post lr, r1, r2
         sub     r1, r1, r2, lsl #3
@@ -71,7 +71,7 @@ function idct_row4_pld_neon
         add     r3, r0, r1, lsl #2
         pld     [r0, r1]
         pld     [r0, r1, lsl #1]
-        pld     [r3, -r1]
+A       pld     [r3, -r1]
         pld     [r3]
         pld     [r3, r1]
         add     r3, r3, r1, lsl #1
@@ -164,6 +164,7 @@ function idct_col4_neon
         orrs    r4, r4, r5
         idct_col4_top
+        it      eq
         addeq   r2, r2, #16
         beq     1f
@@ -176,6 +177,7 @@ function idct_col4_neon
 1:      orrs    r6, r6, r7
         ldrd    r4, [r2, #16]
+        it      eq
         addeq   r2, r2, #16
         beq     2f
@@ -187,6 +189,7 @@ function idct_col4_neon
 2:      orrs    r4, r4, r5
         ldrd    r4, [r2, #16]
+        it      eq
         addeq   r2, r2, #16
         beq     3f
@@ -199,6 +202,7 @@ function idct_col4_neon
         vadd.i32 q13, q13, q8
 3:      orrs    r4, r4, r5
+        it      eq
         addeq   r2, r2, #16
         beq     4f
@@ -100,9 +100,11 @@ NOVFP   vldr    s0, [sp, #12*4]         @ scale
         vst1.32 {q9},  [r2,:128]
         subs    r1, r1, #1
+        it      eq
         popeq   {r4-r11,pc}
         cmp     r4, #0
+        itt     eq
         subeq   r8, r8, #512*4
         subeq   r9, r9, #512*4
         sub     r5, r5, #512*4
@@ -21,6 +21,14 @@
 #ifndef AVCODEC_ARM_VP56_ARITH_H
 #define AVCODEC_ARM_VP56_ARITH_H
 
+#if CONFIG_THUMB
+#   define A(x)
+#   define T(x) x
+#else
+#   define A(x) x
+#   define T(x)
+#endif
+
 #if HAVE_ARMV6 && HAVE_INLINE_ASM
 
 #define vp56_rac_get_prob vp56_rac_get_prob_armv6
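
The assembler-side A/T line prefixes cannot be used in C files, where the
asm text is built from string literals, so this header defines
function-like A()/T() macros that keep or drop whole fragments during
preprocessing. A condensed sketch of the pattern (hypothetical helper,
same shift-splitting idea as the hunks below: ARM folds the shift into
ORR's flexible second operand, while Thumb-2 cannot shift by a register
there and spends two instructions):

static inline unsigned orr_shifted(unsigned w, unsigned t, unsigned s)
{
    __asm__ (A("orr  %0, %0, %1, lsl %2 \n")
             T("lsl  %1, %1, %2         \n")
             T("orr  %0, %0, %1         \n")
             : "+r"(w), "+r"(t) : "r"(s));
    return w;
}
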
@@ -32,15 +40,21 @@ static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr)
     unsigned bit;
     __asm__ ("adds    %3,  %3,  %0           \n"
+             "itt     cs                     \n"
              "cmpcs   %7,  %4                \n"
-             "ldrcsh  %2,  [%4], #2          \n"
+           A("ldrcsh  %2,  [%4], #2          \n")
+           T("ldrhcs  %2,  [%4], #2          \n")
              "rsb     %0,  %6,  #256         \n"
              "smlabb  %0,  %5,  %6,  %0      \n"
+           T("itttt   cs                     \n")
              "rev16cs %2,  %2                \n"
-             "orrcs   %1,  %1,  %2,  lsl %3  \n"
+           T("lslcs   %2,  %2,  %3           \n")
+           T("orrcs   %1,  %1,  %2           \n")
+           A("orrcs   %1,  %1,  %2,  lsl %3  \n")
              "subcs   %3,  %3,  #16          \n"
              "lsr     %0,  %0,  #8           \n"
              "cmp     %1,  %0,  lsl #16      \n"
+             "ittte   ge                     \n"
              "subge   %1,  %1,  %0, lsl #16  \n"
              "subge   %0,  %5,  %0           \n"
              "movge   %2,  #1                \n"
@@ -64,12 +78,17 @@ static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr)
     unsigned tmp;
     __asm__ ("adds    %3,  %3,  %0           \n"
+             "itt     cs                     \n"
              "cmpcs   %7,  %4                \n"
-             "ldrcsh  %2,  [%4], #2          \n"
+           A("ldrcsh  %2,  [%4], #2          \n")
+           T("ldrhcs  %2,  [%4], #2          \n")
              "rsb     %0,  %6,  #256         \n"
              "smlabb  %0,  %5,  %6,  %0      \n"
+           T("itttt   cs                     \n")
              "rev16cs %2,  %2                \n"
-             "orrcs   %1,  %1,  %2,  lsl %3  \n"
+           T("lslcs   %2,  %2,  %3           \n")
+           T("orrcs   %1,  %1,  %2           \n")
+           A("orrcs   %1,  %1,  %2,  lsl %3  \n")
              "subcs   %3,  %3,  #16          \n"
              "lsr     %0,  %0,  #8           \n"
              "lsl     %2,  %0,  #16          \n"
@@ -25,13 +25,18 @@
         lsl     \cw, \cw, \t0
         lsl     \t0, \h,  \t0
         rsb     \h,  \pr, #256
+        it      cs
         ldrhcs  \t1, [\buf], #2
         smlabb  \h,  \t0, \pr, \h
+T       itttt   cs
         rev16cs \t1, \t1
-        orrcs   \cw, \cw, \t1, lsl \bs
+A       orrcs   \cw, \cw, \t1, lsl \bs
+T       lslcs   \t1, \t1, \bs
+T       orrcs   \cw, \cw, \t1
         subcs   \bs, \bs, #16
         lsr     \h,  \h,  #8
         cmp     \cw, \h,  lsl #16
+        itt     ge
         subge   \cw, \cw, \h, lsl #16
         subge   \h,  \t0, \h
 .endm
@@ -40,14 +45,20 @@
         adds    \bs, \bs, \t0
         lsl     \cw, \cw, \t0
         lsl     \t0, \h,  \t0
+        it      cs
         ldrhcs  \t1, [\buf], #2
         mov     \h,  #128
+        it      cs
         rev16cs \t1, \t1
         add     \h,  \h,  \t0, lsl #7
-        orrcs   \cw, \cw, \t1, lsl \bs
+A       orrcs   \cw, \cw, \t1, lsl \bs
+T       ittt    cs
+T       lslcs   \t1, \t1, \bs
+T       orrcs   \cw, \cw, \t1
         subcs   \bs, \bs, #16
         lsr     \h,  \h,  #8
         cmp     \cw, \h,  lsl #16
+        itt     ge
         subge   \cw, \cw, \h, lsl #16
         subge   \h,  \t0, \h
 .endm
@@ -59,6 +70,7 @@ function ff_decode_block_coeffs_armv6
         cmp     r3,  #0
         ldr     r11, [r5]
         ldm     r0,  {r5-r7}            @ high, bits, buf
+        it      ne
         pkhtbne r11, r11, r11, asr #16
         ldr     r8,  [r0, #16]          @ code_word
 0:
@@ -80,19 +92,26 @@ function ff_decode_block_coeffs_armv6
         adds    r6,  r6,  r9
         add     r4,  r4,  #11
         lsl     r8,  r8,  r9
+        it      cs
         ldrhcs  r10, [r7], #2
         lsl     r9,  r5,  r9
         mov     r5,  #128
+        it      cs
         rev16cs r10, r10
         add     r5,  r5,  r9,  lsl #7
-        orrcs   r8,  r8,  r10, lsl r6
+T       ittt    cs
+T       lslcs   r10, r10, r6
+T       orrcs   r8,  r8,  r10
+A       orrcs   r8,  r8,  r10, lsl r6
         subcs   r6,  r6,  #16
         lsr     r5,  r5,  #8
         cmp     r8,  r5,  lsl #16
         movrel  r10, zigzag_scan-1
+        itt     ge
         subge   r8,  r8,  r5,  lsl #16
         subge   r5,  r9,  r5
         ldrb    r10, [r10, r3]
+        it      ge
         rsbge   r12, r12, #0
         cmp     r3,  #16
         strh    r12, [r1, r10]
@@ -108,6 +127,7 @@ function ff_decode_block_coeffs_armv6
         ldr     r0,  [sp]
         ldr     r9,  [r0, #12]
         cmp     r7,  r9
+        it      hi
         movhi   r7,  r9
         stm     r0,  {r5-r7}            @ high, bits, buf
         str     r8,  [r0, #16]          @ code_word
@@ -131,11 +151,13 @@ function ff_decode_block_coeffs_armv6
         mov     r12, #2
         ldrb    r0,  [r4, #4]
         rac_get_prob r5, r6, r7, r8, r0, r9, r10
+        it      ge
         addge   r12, #1
         ldrb    r9,  [lr, r5]
         blt     4f
         ldrb    r0,  [r4, #5]
         rac_get_prob r5, r6, r7, r8, r0, r9, r10
+        it      ge
         addge   r12, #1
         ldrb    r9,  [lr, r5]
         b       4f
@@ -153,6 +175,7 @@ function ff_decode_block_coeffs_armv6
         mov     r12, #5
         mov     r0,  #159
         rac_get_prob r5, r6, r7, r8, r0, r9, r10
+        it      ge
         addge   r12, r12, #1
         ldrb    r9,  [lr, r5]
         b       4f
@@ -160,23 +183,28 @@ function ff_decode_block_coeffs_armv6
         mov     r12, #7
         mov     r0,  #165
         rac_get_prob r5, r6, r7, r8, r0, r9, r10
+        it      ge
         addge   r12, r12, #2
         ldrb    r9,  [lr, r5]
         mov     r0,  #145
         rac_get_prob r5, r6, r7, r8, r0, r9, r10
+        it      ge
         addge   r12, r12, #1
         ldrb    r9,  [lr, r5]
         b       4f
 3:
         ldrb    r0,  [r4, #8]
         rac_get_prob r5, r6, r7, r8, r0, r9, r10
+        it      ge
         addge   r4,  r4,  #1
         ldrb    r9,  [lr, r5]
+        ite     ge
         movge   r12, #2
         movlt   r12, #0
         ldrb    r0,  [r4, #9]
         rac_get_prob r5, r6, r7, r8, r0, r9, r10
         mov     r9,  #8
+        it      ge
         addge   r12, r12, #1
         movrel  r4,  X(ff_vp8_dct_cat_prob)
         lsl     r9,  r9,  r12
@@ -189,6 +217,7 @@ function ff_decode_block_coeffs_armv6
         lsl     r1,  r1,  #1
         rac_get_prob r5, r6, r7, r8, r0, r9, r10
         ldrb    r0,  [r4], #1
+        it      ge
         addge   r1,  r1,  #1
         cmp     r0,  #0
         bne     1b
@@ -200,6 +229,7 @@ function ff_decode_block_coeffs_armv6
         add     r4,  r2,  r4
         add     r4,  r4,  #22
         rac_get_128 r5, r6, r7, r8, r9, r10
+        it      ge
         rsbge   r12, r12, #0
         smulbb  r12, r12, r11
         movrel  r9,  zigzag_scan-1
@@ -746,14 +746,14 @@ function ff_put_vp8_pixels4_neon
         push    {r4-r6,lr}
 1:
         subs    r12, r12, #4
-        ldr     r4, [r2], r3
-        ldr     r5, [r2], r3
-        ldr     r6, [r2], r3
-        ldr     lr, [r2], r3
-        str     r4, [r0], r1
-        str     r5, [r0], r1
-        str     r6, [r0], r1
-        str     lr, [r0], r1
+        ldr_post r4, r2, r3
+        ldr_post r5, r2, r3
+        ldr_post r6, r2, r3
+        ldr_post lr, r2, r3
+        str_post r4, r0, r1
+        str_post r5, r0, r1
+        str_post r6, r0, r1
+        str_post lr, r0, r1
         bgt     1b
         pop     {r4-r6,pc}
 endfunc
@@ -36,6 +36,7 @@ static av_always_inline av_const int FASTDIV(int a, int b)
     int r;
     __asm__ ("cmp     %2, #2               \n\t"
              "ldr     %0, [%3, %2, lsl #2] \n\t"
+             "ite     le                   \n\t"
              "lsrle   %0, %1, #1           \n\t"
              "smmulgt %0, %0, %1           \n\t"
              : "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc");
@@ -101,6 +102,7 @@ static av_always_inline av_const int32_t av_clipl_int32_arm(int64_t a)
 {
     int x, y;
     __asm__ ("adds   %1, %R2, %Q2, lsr #31      \n\t"
+             "itet   ne                         \n\t"
              "mvnne  %1, #1<<31                 \n\t"
              "moveq  %0, %Q2                    \n\t"
              "eorne  %0, %1, %R2, asr #31       \n\t"