| 
							- ;******************************************************************************
 - ;* MMX/SSE2-optimized functions for the RV40 decoder
 - ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 - ;*
 - ;* This file is part of Libav.
 - ;*
 - ;* Libav is free software; you can redistribute it and/or
 - ;* modify it under the terms of the GNU Lesser General Public
 - ;* License as published by the Free Software Foundation; either
 - ;* version 2.1 of the License, or (at your option) any later version.
 - ;*
 - ;* Libav is distributed in the hope that it will be useful,
 - ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 - ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 - ;* Lesser General Public License for more details.
 - ;*
 - ;* You should have received a copy of the GNU Lesser General Public
 - ;* License along with Libav; if not, write to the Free Software
 - ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 - ;******************************************************************************
 - 
 - %include "x86inc.asm"
 - %include "x86util.asm"
 - 
 - SECTION_RODATA
 - 
 - align 16
 - ; Multiplier loaded into m1 by the ssse3 code path and applied with pmulhrsw:
 - ; multiplying by 1 << (16 - 6) = 1024 there stands in for the
 - ; "paddw bias / psrlw 5" pair used by the non-ssse3 code path.
 - shift_round:   times 8 dw 1 << (16 - 6)
 - ; Rounding bias loaded into m1 by the non-ssse3 code path. Defined outside
 - ; this file (presumably packed words of 16 -- confirm against its definition).
 - cextern pw_16
 - 
 - SECTION .text
 - 
 - ; Blend pixels from two sources with per-source weights and store to dst.
 - ; %1 = 0 for 14bits weights, non-zero for 5bits weights
 - ; %2 = dst  %3 = src1  %4 = src2
 - ; %5 = stride (optional; only passed for 8x8 blocks with XMM registers)
 - ;
 - ; With 4 arguments, mmsize contiguous bytes are processed (two halves at
 - ; offsets 0 and mmsize / 2). With 5 arguments, two rows of mmsize / 2 bytes
 - ; are processed, the second row being at offset %5.
 - ;
 - ; Register contract, as set up by RV40_WEIGHT:
 - ;   m0 = zero, m1 = rounding constant,
 - ;   m2 = weight applied to %4, m3 = weight applied to %3
 - ;   (ssse3 with 5bits weights: m3 holds both weights interleaved as bytes
 - ;   and m2 is unused).
 - ; Clobbers m4-m7 and (re)defines OFFSET.
 - %macro RV40_WCORE  4-5
 -     movh       m4, [%3 + 0]
 -     movh       m5, [%4 + 0]
 - %if %0 == 4
 -     ; no stride given: second half of the same row
 - %define OFFSET mmsize / 2
 - %else
 -     ; 8x8 block and sse2, stride was provided
 - %define OFFSET %5
 - %endif
 -     movh       m6, [%3 + OFFSET]
 -     movh       m7, [%4 + OFFSET]
 - 
 - %if %1 == 0
 -     ; 14bits weights
 -     ; zero-extend bytes to words
 -     punpcklbw  m4, m0
 -     punpcklbw  m5, m0
 -     punpcklbw  m6, m0
 -     punpcklbw  m7, m0
 - 
 -     ; (pix << 7) * w, keeping the high 16 bits, i.e. (pix * w) >> 9
 -     psllw      m4, 7
 -     psllw      m5, 7
 -     psllw      m6, 7
 -     psllw      m7, 7
 -     pmulhw     m4, m3
 -     pmulhw     m5, m2
 -     pmulhw     m6, m3
 -     pmulhw     m7, m2
 - 
 -     ; sum both weighted sources
 -     paddw      m4, m5
 -     paddw      m6, m7
 - %else
 -     ; 5bits weights
 - %if cpuflag(ssse3)
 -     ; interleave the bytes of both sources so that a single pmaddubsw
 -     ; multiplies each pair by the interleaved weights in m3 and adds them
 -     punpcklbw  m4, m5
 -     punpcklbw  m6, m7
 - 
 -     pmaddubsw  m4, m3
 -     pmaddubsw  m6, m3
 - %else
 -     ; zero-extend bytes to words, multiply each source by its weight, add
 -     punpcklbw  m4, m0
 -     punpcklbw  m5, m0
 -     punpcklbw  m6, m0
 -     punpcklbw  m7, m0
 - 
 -     pmullw     m4, m3
 -     pmullw     m5, m2
 -     pmullw     m6, m3
 -     pmullw     m7, m2
 -     paddw      m4, m5
 -     paddw      m6, m7
 - %endif
 - 
 - %endif
 - 
 -     ; bias and shift down
 - %if cpuflag(ssse3)
 -     ; m1 = shift_round (1 << 10): pmulhrsw then yields (x + 16) >> 5
 -     pmulhrsw   m4, m1
 -     pmulhrsw   m6, m1
 - %else
 -     ; m1 = pw_16: add the bias, then shift right by 5
 -     paddw      m4, m1
 -     paddw      m6, m1
 -     psrlw      m4, 5
 -     psrlw      m6, 5
 - %endif
 - 
 -     ; pack both halves back to unsigned bytes with saturation
 -     packuswb   m4, m6
 - %if %0 == 5
 -     ; Only called for 8x8 blocks and sse2
 -     ; low half goes to the first row, high half to the row at +stride
 -     movh       [%2 +  0], m4
 -     movhps     [%2 + %5], m4
 - %else
 -     mova       [%2], m4
 - %endif
 - %endmacro
 - 
 - 
 - ; One iteration of the weighting loop.
 - ; %1 = block size (8 or 16)  %2 = weights kind, forwarded to RV40_WCORE
 - ; Uses r0 = dst, r1 = src1, r2 = src2, r5 = stride, r6 = loop counter.
 - ; Ends with "dec r6", so the caller can branch on the zero flag.
 - %macro MAIN_LOOP   2
 - %if mmsize != 8 && %1 == 8
 -     ; Wide registers, 8x8 block: one call covers two 8-pixel rows
 -     RV40_WCORE %2, r0, r1, r2, r5
 -     ; Step over the two rows just written
 -     lea        r0, [r0 + 2 * r5]
 -     lea        r1, [r1 + 2 * r5]
 -     lea        r2, [r2 + 2 * r5]
 - %else
 -     ; One row per iteration
 -     RV40_WCORE %2, r0, r1, r2
 - %if mmsize == 8 && %1 == 16
 -     ; 8-byte registers need a second pass for the right half of a 16-wide row
 -     RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
 - %endif
 - 
 -     ; Step to the next row
 -     add        r0, r5
 -     add        r1, r5
 -     add        r2, r5
 - %endif
 - 
 -     dec        r6
 - %endmacro
 - 
 - ; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
 - ; %1=size  %2=num of xmm regs
 - ;
 - ; Registers: r0 = dst, r1 = src1, r2 = src2, r3 = w1, r4 = w2, r5 = stride,
 - ;            r6 = scratch, then loop counter.
 - ;            m0 = zero, m1 = rounding constant, m2/m3 = weights, m4-m7 = scratch.
 - ; w1, w2 and stride are 32-bit ints: only their 32-bit views (r3d, r4d) are
 - ; read, and stride is sign-extended before being used in address arithmetic.
 - %macro RV40_WEIGHT  2
 - cglobal rv40_weight_func_%1, 6, 7, %2
 -     ; stride is an int, so the upper half of its register is undefined on
 -     ; 64-bit targets; sign-extend it once (expands to nothing when r5 and
 -     ; r5d are the same register).
 -     movsxdifnidn r5, r5d
 - %if cpuflag(ssse3)
 -     mova       m1, [shift_round]
 - %else
 -     mova       m1, [pw_16]
 - %endif
 -     pxor       m0, m0
 -     mov        r6d, r3d
 -     or         r6d, r4d
 -     ; The weights are FP0.14 notation of fractions depending on pts.
 -     ; For timebases without rounding error (i.e. PAL), the fractions
 -     ; can be simplified, and several operations can be avoided.
 -     ; Therefore, we check here whether they are multiples of 2^9 for
 -     ; those simplifications to occur.
 -     test       r6d, 0x1FF
 -     ; Set loop counter and increments
 -     ; (mov leaves the flags untouched, so the test result survives until jz)
 - %if mmsize == 8
 -     mov        r6, %1
 - %else
 -     mov        r6, (%1 * %1) / mmsize
 - %endif
 - 
 -     ; Use result of test now
 -     jz .loop_512
 -     ; Generic case: broadcast the 14bits weights to every word
 -     movd       m2, r3d
 -     movd       m3, r4d
 -     SPLATW     m2, m2
 -     SPLATW     m3, m3
 - 
 - .loop:
 -     MAIN_LOOP  %1, 0
 -     jnz        .loop
 -     REP_RET
 - 
 -     ; Weights are multiple of 512, which allows some shortcuts
 - .loop_512:
 -     ; Reduce the weights to 5 bits
 -     sar        r3d, 9
 -     sar        r4d, 9
 -     movd       m2, r3d
 -     movd       m3, r4d
 - %if cpuflag(ssse3)
 -     ; Interleave both weights as bytes (w2, w1) for pmaddubsw, then broadcast
 -     punpcklbw  m3, m2
 -     SPLATW     m3, m3
 - %else
 -     SPLATW     m2, m2
 -     SPLATW     m3, m3
 - %endif
 - .loop2:
 -     MAIN_LOOP  %1, 1
 -     jnz        .loop2
 -     REP_RET
 - 
 - %endmacro
 - 
 - ; Instantiate the 8x8 and 16x16 weighting functions once per instruction set.
 - ; The second argument is the xmm register count handed to cglobal
 - ; (0 for the MMX versions).
 - ; NOTE(review): the per-cpu symbol suffix is expected to come from x86inc's
 - ; INIT_* / cglobal machinery -- confirm against x86inc.asm.
 - INIT_MMX mmx
 - RV40_WEIGHT    8, 0
 - RV40_WEIGHT   16, 0
 - 
 - ; 8x8 blocks use the two-rows-per-iteration path of MAIN_LOOP from here on
 - INIT_XMM sse2
 - RV40_WEIGHT    8, 8
 - RV40_WEIGHT   16, 8
 - 
 - ; Same layout as sse2; RV40_WCORE switches to pmaddubsw / pmulhrsw
 - INIT_XMM ssse3
 - RV40_WEIGHT    8, 8
 - RV40_WEIGHT   16, 8
 
 
  |