| 
							- /*
 -  * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 -  *
 -  * This file is part of Libav.
 -  *
 -  * Libav is free software; you can redistribute it and/or
 -  * modify it under the terms of the GNU Lesser General Public
 -  * License as published by the Free Software Foundation; either
 -  * version 2.1 of the License, or (at your option) any later version.
 -  *
 -  * Libav is distributed in the hope that it will be useful,
 -  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 -  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 -  * Lesser General Public License for more details.
 -  *
 -  * You should have received a copy of the GNU Lesser General Public
 -  * License along with Libav; if not, write to the Free Software
 -  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 -  */
 - 
 - #include "config.h"
 - #include "asm.S"
 - 
 - /**
 -  * Element-wise product of two float arrays using VFP short vectors:
 -  *   dst[i] = src0[i] * src1[i]   for 0 <= i < len
 -  *
 -  * Assumes that len is a positive number and is a multiple of 8.
 -  *
 -  * Register use: r0 = dst, r1 = src0, r2 = src1, r3 = remaining count,
 -  * r12 = FPSCR image.  d8-d15 (s16-s31) are saved on entry and restored on
 -  * exit because the loop uses them as a second working set.
 -  *
 -  * The loop is software pipelined over two register sets (s0-s15 and
 -  * s16-s31), each holding 8 elements of src0 and 8 of src1.  On entry to
 -  * label 1 one set is already loaded.  After "subs r3, r3, #16" the
 -  * GE-conditional instructions run when at least 8 further elements remain,
 -  * and the GT-conditional ones when at least 16 further elements remain.
 -  *
 -  * On exit FPSCR is written back with bits 16-18 cleared rather than
 -  * restored from a saved copy.
 -  * NOTE(review): presumably callers always run with vector length 1;
 -  * confirm.
 -  */
 - @ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
 - function ff_vector_fmul_vfp, export=1
 -         vpush           {d8-d15}             @ s16-s31 are overwritten below
 -         fmrx            r12, fpscr           @ r12 = current FPSCR
 -         orr             r12, r12, #(3 << 16) /* set vector size to 4 */
 -         fmxr            fpscr, r12
 - 
 -         vldmia          r1!, {s0-s3}         @ src0[0..3]
 -         vldmia          r2!, {s8-s11}        @ src1[0..3]
 -         vldmia          r1!, {s4-s7}         @ src0[4..7]
 -         vldmia          r2!, {s12-s15}       @ src1[4..7]
 -         vmul.f32        s8,  s0,  s8         @ s8-s11 = s0-s3 * s8-s11 (length-4 vector op)
 - 1:
 -         subs            r3,  r3,  #16        @ flags: GE = 8 more to do, GT = 16 more to do
 -         vmul.f32        s12, s4,  s12        @ s12-s15 = s4-s7 * s12-s15
 -         itttt           ge
 -         vldmiage        r1!, {s16-s19}       @ second set: next 8 elements of src0/src1
 -         vldmiage        r2!, {s24-s27}
 -         vldmiage        r1!, {s20-s23}
 -         vldmiage        r2!, {s28-s31}
 -         it              ge
 -         vmulge.f32      s24, s16, s24        @ s24-s27 = s16-s19 * s24-s27
 -         vstmia          r0!, {s8-s11}        @ store the 8 products of the first set
 -         vstmia          r0!, {s12-s15}
 -         it              ge
 -         vmulge.f32      s28, s20, s28        @ s28-s31 = s20-s23 * s28-s31
 -         itttt           gt
 -         vldmiagt        r1!, {s0-s3}         @ refill the first set for the next iteration
 -         vldmiagt        r2!, {s8-s11}
 -         vldmiagt        r1!, {s4-s7}
 -         vldmiagt        r2!, {s12-s15}
 -         ittt            ge
 -         vmulge.f32      s8,  s0,  s8         @ when r3 == 0 nothing was reloaded; this result is not stored
 -         vstmiage        r0!, {s24-s27}       @ store the 8 products of the second set
 -         vstmiage        r0!, {s28-s31}
 -         bgt             1b                   @ loop while the first set was refilled
 - 
 -         bic             r12, r12, #(7 << 16) /* set vector size back to 1 */
 -         fmxr            fpscr, r12
 -         vpop            {d8-d15}
 -         bx              lr
 - endfunc
 - 
 - /**
 -  * ARM VFP implementation of 'vector_fmul_window_c' function.
 -  * Assumes that len is a positive number.
 -  *
 -  * With a = src0[i], b = src1[len-1-i], wi = win[i] and wj = win[2*len-1-i],
 -  * for 0 <= i < len this stores
 -  *   dst[i]         = a * wj - b * wi
 -  *   dst[2*len-1-i] = a * wi + b * wj
 -  *
 -  * DST0, SRC0 and WIN0 walk upwards through memory; DST1, SRC1 and WIN1 walk
 -  * downwards.  Descending data is loaded and stored one register at a time
 -  * so that consecutive registers hold elements at descending addresses.
 -  *
 -  * Structure:
 -  *   - If len is not a multiple of 8, the (len & 1), (len & 2) and (len & 4)
 -  *     leftover elements are processed first with scalar operations.
 -  *   - The remaining multiple of 8 is processed at labels 4-6 with
 -  *     length-4 vector operations, 8 elements per loop iteration, in a
 -  *     software-pipelined schedule.  Indentation depth in that section
 -  *     appears to mark which group of four elements an instruction
 -  *     belongs to.
 -  *
 -  * len == 0 is not handled: "tst LEN, #7" would send it down the
 -  * multiple-of-8 path.
 -  *
 -  * FPSCR is saved in OLDFPSCR on entry and restored at label 7.
 -  */
 - @ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
 - @                                const float *src1, const float *win, int len)
 - function ff_vector_fmul_window_vfp, export=1
 - DST0    .req    a1                          @ ascending output pointer
 - SRC0    .req    a2                          @ ascending src0 pointer
 - SRC1    .req    a3                          @ descending src1 pointer
 - WIN0    .req    a4                          @ ascending window pointer
 - LEN     .req    v1                          @ elements still to process
 - DST1    .req    v2                          @ descending output pointer
 - WIN1    .req    v3                          @ descending window pointer
 - OLDFPSCR .req   ip                          @ FPSCR value on entry
 - 
 -         push    {v1-v3,lr}                  @ lr is reused below as scratch for FPSCR constants
 -         ldr     LEN, [sp, #4*4+0]           @ fifth argument, just above the four words pushed
 -         vpush   {s16-s31}                   @ s16-s31 are overwritten below
 -         fmrx    OLDFPSCR, FPSCR
 -         add     DST1, DST0, LEN, lsl #3     @ DST1 = dst + 2*len floats
 -         add     SRC1, SRC1, LEN, lsl #2     @ SRC1 = src1 + len floats
 -         add     WIN1, WIN0, LEN, lsl #3     @ WIN1 = win + 2*len floats
 - 
 -         tst     LEN, #7
 -         beq     4f                          @ common case: len is a multiple of 8
 - 
 -         ldr     lr, =0x03000000             @ RunFast mode, scalar mode
 -         fmxr    FPSCR, lr
 - 
 -         tst     LEN, #1                     @ one leftover element?
 -         beq     1f
 -         vldmdb  WIN1!, {s0}                 @ wj
 -         vldmia  SRC0!, {s8}                 @ a
 -         vldmia  WIN0!, {s16}                @ wi
 -         vmul.f  s24, s0, s8                 @ wj * a
 -         vldmdb  SRC1!, {s20}                @ b
 -         vmul.f  s8, s16, s8                 @ wi * a
 -         vmls.f  s24, s16, s20               @ wj * a - wi * b
 -         vmla.f  s8, s0, s20                 @ wi * a + wj * b
 -         vstmia  DST0!, {s24}
 -         vstmdb  DST1!, {s8}
 - 1:
 -         tst     LEN, #2                     @ two leftover elements?
 -         beq     2f
 -         vldmdb  WIN1!, {s0}                 @ loaded singly: s0 gets the higher address
 -         vldmdb  WIN1!, {s1}
 -         vldmia  SRC0!, {s8-s9}
 -         vldmia  WIN0!, {s16-s17}
 -         vmul.f  s24, s0, s8
 -         vmul.f  s25, s1, s9
 -         vldmdb  SRC1!, {s20}
 -         vldmdb  SRC1!, {s21}
 -         vmul.f  s8, s16, s8
 -         vmul.f  s9, s17, s9
 -         vmls.f  s24, s16, s20
 -         vmls.f  s25, s17, s21
 -         vmla.f  s8, s0, s20
 -         vmla.f  s9, s1, s21
 -         vstmia  DST0!, {s24-s25}
 -         vstmdb  DST1!, {s8}                 @ stored singly: s8 goes to the higher address
 -         vstmdb  DST1!, {s9}
 - 2:
 -         tst     LEN, #4                     @ four leftover elements?
 -         beq     3f
 -         vldmdb  WIN1!, {s0}
 -         vldmdb  WIN1!, {s1}
 -         vldmdb  WIN1!, {s2}
 -         vldmdb  WIN1!, {s3}
 -         vldmia  SRC0!, {s8-s11}
 -         vldmia  WIN0!, {s16-s19}
 -         vmul.f  s24, s0, s8
 -         vmul.f  s25, s1, s9
 -         vmul.f  s26, s2, s10
 -         vmul.f  s27, s3, s11
 -         vldmdb  SRC1!, {s20}
 -         vldmdb  SRC1!, {s21}
 -         vldmdb  SRC1!, {s22}
 -         vldmdb  SRC1!, {s23}
 -         vmul.f  s8, s16, s8
 -         vmul.f  s9, s17, s9
 -         vmul.f  s10, s18, s10
 -         vmul.f  s11, s19, s11
 -         vmls.f  s24, s16, s20
 -         vmls.f  s25, s17, s21
 -         vmls.f  s26, s18, s22
 -         vmls.f  s27, s19, s23
 -         vmla.f  s8, s0, s20
 -         vmla.f  s9, s1, s21
 -         vmla.f  s10, s2, s22
 -         vmla.f  s11, s3, s23
 -         vstmia  DST0!, {s24-s27}
 -         vstmdb  DST1!, {s8}
 -         vstmdb  DST1!, {s9}
 -         vstmdb  DST1!, {s10}
 -         vstmdb  DST1!, {s11}
 - 3:
 -         bics    LEN, LEN, #7                @ drop the leftover count just handled
 -         beq     7f                          @ nothing left for the vector loop
 - 4:
 -         ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
 -         fmxr    FPSCR, lr
 - 
 -         vldmdb  WIN1!, {s0}
 -         vldmdb  WIN1!, {s1}
 -         vldmdb  WIN1!, {s2}
 -         vldmdb  WIN1!, {s3}
 -         vldmia  SRC0!, {s8-s11}
 -         vldmia  WIN0!, {s16-s19}
 -         vmul.f  s24, s0, s8                     @ vector * vector
 -         vldmdb  SRC1!, {s20}
 -         vldmdb  SRC1!, {s21}
 -         vldmdb  SRC1!, {s22}
 -         vldmdb  SRC1!, {s23}
 -         vmul.f  s8, s16, s8                     @ vector * vector
 -         vmls.f  s24, s16, s20                   @ vector * vector
 -             vldmdb  WIN1!, {s4}
 -             vldmdb  WIN1!, {s5}
 -             vldmdb  WIN1!, {s6}
 -             vldmdb  WIN1!, {s7}
 -             vldmia  SRC0!, {s12-s13}
 -         vmla.f  s8, s0, s20                     @ vector * vector
 -             vldmia  SRC0!, {s14-s15}
 -         subs    LEN, LEN, #8                    @ 8 elements are now in flight
 -         beq     6f                              @ no more to load: drain at label 6
 - 5:          vldmia  WIN0!, {s20-s23}
 -             vmul.f  s28, s4, s12                @ vector * vector
 -         vstmia  DST0!, {s24-s25}
 -             vldmdb  SRC1!, {s16}
 -             vldmdb  SRC1!, {s17}
 -             vldmdb  SRC1!, {s18}
 -             vldmdb  SRC1!, {s19}
 -             vmul.f  s12, s20, s12               @ vector * vector
 -         vstmia  DST0!, {s26-s27}
 -         vstmdb  DST1!, {s8}
 -         vstmdb  DST1!, {s9}
 -         vstmdb  DST1!, {s10}
 -         vstmdb  DST1!, {s11}
 -             vmls.f  s28, s20, s16               @ vector * vector
 -                 vldmdb  WIN1!, {s0}
 -                 vldmdb  WIN1!, {s1}
 -                 vldmdb  WIN1!, {s2}
 -                 vldmdb  WIN1!, {s3}
 -                 vldmia  SRC0!, {s8-s9}
 -             vmla.f  s12, s4, s16                @ vector * vector
 -                 vldmia  SRC0!, {s10-s11}
 -         subs    LEN, LEN, #8
 -                 vldmia  WIN0!, {s16-s19}
 -                 vmul.f  s24, s0, s8             @ vector * vector
 -             vstmia  DST0!, {s28-s29}
 -                 vldmdb  SRC1!, {s20}
 -                 vldmdb  SRC1!, {s21}
 -                 vldmdb  SRC1!, {s22}
 -                 vldmdb  SRC1!, {s23}
 -                 vmul.f  s8, s16, s8             @ vector * vector
 -             vstmia  DST0!, {s30-s31}
 -             vstmdb  DST1!, {s12}
 -             vstmdb  DST1!, {s13}
 -             vstmdb  DST1!, {s14}
 -             vstmdb  DST1!, {s15}
 -                 vmls.f  s24, s16, s20           @ vector * vector
 -                     vldmdb  WIN1!, {s4}
 -                     vldmdb  WIN1!, {s5}
 -                     vldmdb  WIN1!, {s6}
 -                     vldmdb  WIN1!, {s7}
 -                     vldmia  SRC0!, {s12-s13}
 -                 vmla.f  s8, s0, s20             @ vector * vector
 -                     vldmia  SRC0!, {s14-s15}
 -         bne     5b
 - 6:                  vldmia  WIN0!, {s20-s23}
 -                     vmul.f  s28, s4, s12        @ vector * vector
 -                 vstmia  DST0!, {s24-s25}
 -                     vldmdb  SRC1!, {s16}
 -                     vldmdb  SRC1!, {s17}
 -                     vldmdb  SRC1!, {s18}
 -                     vldmdb  SRC1!, {s19}
 -                     vmul.f  s12, s20, s12       @ vector * vector
 -                 vstmia  DST0!, {s26-s27}
 -                 vstmdb  DST1!, {s8}
 -                 vstmdb  DST1!, {s9}
 -                 vstmdb  DST1!, {s10}
 -                 vstmdb  DST1!, {s11}
 -                     vmls.f  s28, s20, s16       @ vector * vector
 -                     vmla.f  s12, s4, s16        @ vector * vector
 -                     vstmia  DST0!, {s28-s31}
 -                     vstmdb  DST1!, {s12}
 -                     vstmdb  DST1!, {s13}
 -                     vstmdb  DST1!, {s14}
 -                     vstmdb  DST1!, {s15}
 - 7:
 -         fmxr    FPSCR, OLDFPSCR             @ restore the FPSCR value read on entry
 -         vpop    {s16-s31}
 -         pop     {v1-v3,pc}
 - 
 -         .unreq  DST0
 -         .unreq  SRC0
 -         .unreq  SRC1
 -         .unreq  WIN0
 -         .unreq  LEN
 -         .unreq  OLDFPSCR
 -         .unreq  DST1
 -         .unreq  WIN1
 - endfunc
 - 
 - /**
 -  * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
 -  * Assumes that len is a positive number and is a multiple of 8.
 -  *
 -  * Computes dst[i] = src0[i] * src1[len-1-i] for 0 <= i < len.
 -  *
 -  * Register use: r0 = dst (ascending), r1 = src0 (ascending), r2 = src1,
 -  * moved to src1 + len and read downwards, r3 = remaining count.
 -  * d8-d15 (s16-s31) are saved on entry and restored on exit.
 -  *
 -  * This routine does not write FPSCR; every product is issued as its own
 -  * vmul.f32, pairing the registers of a descending src1 block in reverse
 -  * order with those of an ascending src0 block.
 -  *
 -  * The loop is software pipelined over two register sets (s0-s15 and
 -  * s16-s31).  After "subs r3, r3, #16" the GE-conditional instructions run
 -  * when at least 8 further elements remain, and the GT-conditional ones when
 -  * at least 16 further elements remain.
 -  */
 - @ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
 - @                                 const float *src1, int len)
 - function ff_vector_fmul_reverse_vfp, export=1
 -         vpush           {d8-d15}             @ s16-s31 are overwritten below
 -         add             r2,  r2,  r3, lsl #2 @ r2 = src1 + len floats
 -         vldmdb          r2!, {s0-s3}         @ s3 = src1[len-1] ... s0 = src1[len-4]
 -         vldmia          r1!, {s8-s11}        @ src0[0..3]
 -         vldmdb          r2!, {s4-s7}         @ s7 = src1[len-5] ... s4 = src1[len-8]
 -         vldmia          r1!, {s12-s15}       @ src0[4..7]
 -         vmul.f32        s8,  s3,  s8
 -         vmul.f32        s9,  s2,  s9
 -         vmul.f32        s10, s1,  s10
 -         vmul.f32        s11, s0,  s11
 - 1:
 -         subs            r3,  r3,  #16        @ flags: GE = 8 more to do, GT = 16 more to do
 -         it              ge
 -         vldmdbge        r2!, {s16-s19}       @ second set: next src1 block (descending)
 -         vmul.f32        s12, s7,  s12
 -         it              ge
 -         vldmiage        r1!, {s24-s27}       @ second set: next src0 block
 -         vmul.f32        s13, s6,  s13
 -         it              ge
 -         vldmdbge        r2!, {s20-s23}
 -         vmul.f32        s14, s5,  s14
 -         it              ge
 -         vldmiage        r1!, {s28-s31}
 -         vmul.f32        s15, s4,  s15
 -         it              ge
 -         vmulge.f32      s24, s19, s24
 -         it              gt
 -         vldmdbgt        r2!, {s0-s3}         @ refill the first set for the next iteration
 -         it              ge
 -         vmulge.f32      s25, s18, s25
 -         vstmia          r0!, {s8-s13}        @ store the first 6 products of the first set
 -         it              ge
 -         vmulge.f32      s26, s17, s26
 -         it              gt
 -         vldmiagt        r1!, {s8-s11}
 -         itt             ge
 -         vmulge.f32      s27, s16, s27
 -         vmulge.f32      s28, s23, s28
 -         it              gt
 -         vldmdbgt        r2!, {s4-s7}
 -         it              ge
 -         vmulge.f32      s29, s22, s29
 -         vstmia          r0!, {s14-s15}       @ store the remaining 2 products of the first set
 -         ittt            ge
 -         vmulge.f32      s30, s21, s30
 -         vmulge.f32      s31, s20, s31
 -         vmulge.f32      s8,  s3,  s8
 -         it              gt
 -         vldmiagt        r1!, {s12-s15}
 -         itttt           ge
 -         vmulge.f32      s9,  s2,  s9
 -         vmulge.f32      s10, s1,  s10
 -         vstmiage        r0!, {s24-s27}       @ store the 8 products of the second set
 -         vmulge.f32      s11, s0,  s11
 -         it              ge
 -         vstmiage        r0!, {s28-s31}
 -         bgt             1b                   @ loop while the first set was refilled
 - 
 -         vpop            {d8-d15}
 -         bx              lr
 - endfunc
 - 
 - /**
 -  * ARM VFP implementation of 'butterflies_float_c' function.
 -  * Assumes that len is a positive number.
 -  *
 -  * In place, for 0 <= i < len, with a = v1[i] and b = v2[i]:
 -  *   v1[i] = a + b
 -  *   v2[i] = a - b
 -  *
 -  * Both pointers are post-incremented by the loads, so results are written
 -  * back with negative offsets from the already advanced pointers.
 -  *
 -  * Structure:
 -  *   - If len is not a multiple of 8, the (len & 1), (len & 2) and (len & 4)
 -  *     leftover elements are processed first with scalar operations.
 -  *   - The remaining multiple of 8 is processed at labels 4-6 with
 -  *     length-4 vector operations, 8 elements per loop iteration, in a
 -  *     software-pipelined schedule.
 -  *
 -  * len == 0 is not handled: "tst LEN, #7" would send it down the
 -  * multiple-of-8 path.
 -  *
 -  * FPSCR is saved in OLDFPSCR on entry and restored at label 7.
 -  */
 - @ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len)
 - function ff_butterflies_float_vfp, export=1
 - BASE1   .req    a1                          @ walks through v1
 - BASE2   .req    a2                          @ walks through v2
 - LEN     .req    a3                          @ elements still to process
 - OLDFPSCR .req   a4                          @ FPSCR value on entry
 - 
 -         vpush   {s16-s31}                   @ s16-s31 are overwritten below
 -         fmrx    OLDFPSCR, FPSCR
 - 
 -         tst     LEN, #7
 -         beq     4f                          @ common case: len is a multiple of 8
 - 
 -         ldr     ip, =0x03000000             @ RunFast mode, scalar mode
 -         fmxr    FPSCR, ip
 - 
 -         tst     LEN, #1                     @ one leftover element?
 -         beq     1f
 -         vldmia  BASE1!, {s0}                @ a
 -         vldmia  BASE2!, {s8}                @ b
 -         vadd.f  s16, s0, s8                 @ a + b
 -         vsub.f  s24, s0, s8                 @ a - b
 -         vstr    s16, [BASE1, #0-4*1]        @ overwrite the element just read
 -         vstr    s24, [BASE2, #0-4*1]
 - 1:
 -         tst     LEN, #2                     @ two leftover elements?
 -         beq     2f
 -         vldmia  BASE1!, {s0-s1}
 -         vldmia  BASE2!, {s8-s9}
 -         vadd.f  s16, s0, s8
 -         vadd.f  s17, s1, s9
 -         vsub.f  s24, s0, s8
 -         vsub.f  s25, s1, s9
 -         vstr    d8, [BASE1, #0-8*1]    @ s16,s17
 -         vstr    d12, [BASE2, #0-8*1]   @ s24,s25
 - 2:
 -         tst     LEN, #4                     @ four leftover elements?
 -         beq     3f
 -         vldmia  BASE1!, {s0-s1}
 -         vldmia  BASE2!, {s8-s9}
 -         vldmia  BASE1!, {s2-s3}
 -         vldmia  BASE2!, {s10-s11}
 -         vadd.f  s16, s0, s8
 -         vadd.f  s17, s1, s9
 -         vsub.f  s24, s0, s8
 -         vsub.f  s25, s1, s9
 -         vadd.f  s18, s2, s10
 -         vadd.f  s19, s3, s11
 -         vsub.f  s26, s2, s10
 -         vsub.f  s27, s3, s11
 -         vstr    d8, [BASE1, #0-16*1]    @ s16,s17
 -         vstr    d12, [BASE2, #0-16*1]   @ s24,s25
 -         vstr    d9, [BASE1, #8-16*1]    @ s18,s19
 -         vstr    d13, [BASE2, #8-16*1]   @ s26,s27
 - 3:
 -         bics    LEN, LEN, #7                @ drop the leftover count just handled
 -         beq     7f                          @ nothing left for the vector loop
 - 4:
 -         ldr     ip, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
 -         fmxr    FPSCR, ip
 - 
 -         vldmia  BASE1!, {s0-s1}
 -         vldmia  BASE2!, {s8-s9}
 -         vldmia  BASE1!, {s2-s3}
 -         vldmia  BASE2!, {s10-s11}
 -         vadd.f  s16, s0, s8                 @ s16-s19 = s0-s3 + s8-s11
 -             vldmia  BASE1!, {s4-s5}
 -             vldmia  BASE2!, {s12-s13}
 -             vldmia  BASE1!, {s6-s7}
 -             vldmia  BASE2!, {s14-s15}
 -         vsub.f  s24, s0, s8                 @ s24-s27 = s0-s3 - s8-s11
 -             vadd.f  s20, s4, s12            @ s20-s23 = s4-s7 + s12-s15
 -         subs    LEN, LEN, #8                @ 8 elements are now in flight
 -         beq     6f                          @ no more to load: drain at label 6
 - 5:              vldmia  BASE1!, {s0-s3}
 -                 vldmia  BASE2!, {s8-s11}
 -             vsub.f  s28, s4, s12            @ s28-s31 = s4-s7 - s12-s15
 -         vstr    d8, [BASE1, #0-16*3]    @ s16,s17
 -         vstr    d9, [BASE1, #8-16*3]    @ s18,s19
 -         vstr    d12, [BASE2, #0-16*3]   @ s24,s25
 -         vstr    d13, [BASE2, #8-16*3]   @ s26,s27
 -                 vadd.f  s16, s0, s8
 -                     vldmia  BASE1!, {s4-s7}
 -                     vldmia  BASE2!, {s12-s15}
 -                 vsub.f  s24, s0, s8
 -             vstr    d10, [BASE1, #0-16*3]   @ s20,s21
 -             vstr    d11, [BASE1, #8-16*3]   @ s22,s23
 -             vstr    d14, [BASE2, #0-16*3]   @ s28,s29
 -             vstr    d15, [BASE2, #8-16*3]   @ s30,s31
 -                     vadd.f  s20, s4, s12
 -         subs    LEN, LEN, #8
 -         bne     5b
 - 6:                   vsub.f  s28, s4, s12
 -                 vstr    d8, [BASE1, #0-16*2]    @ s16,s17
 -                 vstr    d9, [BASE1, #8-16*2]    @ s18,s19
 -                 vstr    d12, [BASE2, #0-16*2]   @ s24,s25
 -                 vstr    d13, [BASE2, #8-16*2]   @ s26,s27
 -                     vstr    d10, [BASE1, #0-16*1]   @ s20,s21
 -                     vstr    d11, [BASE1, #8-16*1]   @ s22,s23
 -                     vstr    d14, [BASE2, #0-16*1]   @ s28,s29
 -                     vstr    d15, [BASE2, #8-16*1]   @ s30,s31
 - 7:
 -         fmxr    FPSCR, OLDFPSCR             @ restore the FPSCR value read on entry
 -         vpop    {s16-s31}
 -         bx      lr
 - 
 -         .unreq  BASE1
 -         .unreq  BASE2
 -         .unreq  LEN
 -         .unreq  OLDFPSCR
 - endfunc
 
 
  |