Modify the asm accordingly. GLOBAL is now no longer necessary for PIC-compliant loads.

Originally committed as revision 23739 to svn://svn.ffmpeg.org/ffmpeg/trunk
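As an illustrative sketch only (not part of the committed patch): under the old scheme every PIC-safe load had to spell out the GLOBAL macro, which x86inc.asm expanded to "wrt rip" when PIC was defined; with "default rel" now emitted instead, the bare memory operand already assembles as RIP-relative. The symbol foo below is just the placeholder used in the old x86inc.asm comment.

    ; old convention: explicit GLOBAL marker (GLOBAL expands to "wrt rip" under PIC)
    mov   eax, [foo GLOBAL]
    ; new convention: "default rel" is in effect, so the plain form is already RIP-relative
    mov   eax, [foo]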
@@ -40,7 +40,7 @@ section .text align=16
%endmacro
%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
%define lend r10d
@@ -272,7 +272,7 @@ SCALARPRODUCT_LOOP 0
; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
movq mm0, [topq]
movq mm2, mm0
@@ -370,23 +370,23 @@ cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_to
RET
%endmacro
; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
mova m5, [pb_7 GLOBAL]
mova m4, [pb_zzzz3333zzzzbbbb GLOBAL]
mova m3, [pb_zz11zz55zz99zzdd GLOBAL]
mova m5, [pb_7]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm
psllq m0, 56
ADD_HFYU_LEFT_LOOP 1
INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
mova m5, [pb_f GLOBAL]
mova m6, [pb_zzzzzzzz77777777 GLOBAL]
mova m4, [pb_zzzz3333zzzzbbbb GLOBAL]
mova m3, [pb_zz11zz55zz99zzdd GLOBAL]
mova m5, [pb_f]
mova m6, [pb_zzzzzzzz77777777]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm
pslldq m0, 15
test srcq, 15
@@ -398,7 +398,7 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
ADD_HFYU_LEFT_LOOP 0
; float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len)
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
neg offsetq
shl offsetq, 2
@@ -35,7 +35,7 @@ ps_m1p1: dd 1<<31, 0
%assign i 16
%rep 13
cextern ff_cos_ %+ i
cextern cos_ %+ i
%assign i i<<1
%endrep
@@ -64,7 +64,7 @@ section .text align=16
mova %5, %3
pfsub %3, %4
pfadd %5, %4 ; {t6,t5}
pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7}
pxor %3, [ps_m1p1] ; {t8,t7}
mova %6, %1
pswapd %3, %3
pfadd %1, %5 ; {r0,i0}
@@ -105,8 +105,8 @@ section .text align=16
addps %6, %5 ; {t1,t2,t3,t4}
mova %5, %3
shufps %5, %5, 0xb1 ; {i5,r5,i7,r7}
mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7}
mulps %5, [ps_root2 GLOBAL]
mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
mulps %5, [ps_root2]
addps %3, %5 ; {t8,t7,ta,t9}
mova %5, %6
shufps %6, %3, 0x36 ; {t3,t2,t9,t8}
@@ -309,7 +309,7 @@ fft16_sse:
mova m6, Z(6)
mova m7, Z(7)
T4_SSE m6, m7, m0
PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL]
PASS_SMALL 0, [cos_16], [cos_16+16]
ret
@@ -342,12 +342,12 @@ fft8%1:
T2_3DN m6, m7, Z(6), Z(7)
pswapd m0, m5
pswapd m2, m7
pxor m0, [ps_m1p1 GLOBAL]
pxor m2, [ps_m1p1 GLOBAL]
pxor m0, [ps_m1p1]
pxor m2, [ps_m1p1]
pfsub m5, m0
pfadd m7, m2
pfmul m5, [ps_root2 GLOBAL]
pfmul m7, [ps_root2 GLOBAL]
pfmul m5, [ps_root2]
pfmul m7, [ps_root2]
T4_3DN m1, m3, m5, m7, m0, m2
mova Z(5), m5
mova Z(7), m7
@@ -445,7 +445,7 @@ fft %+ n %+ %3%2:
add r0, n*2 - (n2&(-2<<%1))
call fft %+ n4 %+ %2
sub r0, n*6 + (n2&(-2<<%1))
lea r1, [ff_cos_ %+ n GLOBAL]
lea r1, [cos_ %+ n]
mov r2d, n4/2
jmp pass%3%2
@@ -461,10 +461,10 @@ section .text
; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
lea r2, [dispatch_tab%3%2 GLOBAL]
lea r2, [dispatch_tab%3%2]
mov r2, [r2 + (nbitsq-2)*gprsize]
%ifdef PIC
lea r3, [$$ GLOBAL]
lea r3, [$$]
add r2, r3
%endif
call r2
@@ -234,18 +234,18 @@ SECTION .text
%macro DEBLOCK_P0_Q0 0
mova m5, m1
pxor m5, m2 ; p0^q0
pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
pand m5, [pb_01] ; (p0^q0)&1
pcmpeqb m4, m4
pxor m3, m4
pavgb m3, m0 ; (p1 - q1 + 256)>>1
pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
pxor m4, m1
pavgb m4, m2 ; (q0 - p0 + 256)>>1
pavgb m3, m5
paddusb m3, m4 ; d+128+33
mova m6, [pb_a1 GLOBAL]
mova m6, [pb_a1]
psubusb m6, m3
psubusb m3, [pb_a1 GLOBAL]
psubusb m3, [pb_a1]
pminub m6, m7
pminub m3, m7
psubusb m1, m6
@@ -263,7 +263,7 @@ SECTION .text
pavgb %6, m2
pavgb %2, %6 ; avg(p2,avg(p0,q0))
pxor %6, %3
pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
pand %6, [pb_01] ; (p2^avg(p0,q0))&1
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
mova %6, %1
psubusb %6, %5
@@ -612,8 +612,8 @@ DEBLOCK_LUMA sse2, v, 16
%define mask0 spill(2)
%define mask1p spill(3)
%define mask1q spill(4)
%define mpb_00 [pb_00 GLOBAL]
%define mpb_01 [pb_01 GLOBAL]
%define mpb_00 [pb_00]
%define mpb_01 [pb_01]
%endif
;-----------------------------------------------------------------------------
@@ -637,7 +637,7 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
mova q1, [r0+r1]
%ifdef ARCH_X86_64
pxor mpb_00, mpb_00
mova mpb_01, [pb_01 GLOBAL]
mova mpb_01, [pb_01]
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
SWAP 7, 12 ; m12=mask0
pavgb t5, mpb_00
@@ -656,8 +656,8 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
mova m4, t5
mova mask0, m7
pavgb m4, [pb_00 GLOBAL]
pavgb m4, [pb_01 GLOBAL] ; alpha/4+1
pavgb m4, [pb_00]
pavgb m4, [pb_01] ; alpha/4+1
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
pand m6, mask0
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
@@ -43,7 +43,7 @@ cglobal x264_add8x4_idct_sse2, 3,3,8
movhps m3, [r1+56]
IDCT4_1D 0,1,2,3,4,5
TRANSPOSE2x4x4W 0,1,2,3,4
paddw m0, [pw_32 GLOBAL]
paddw m0, [pw_32]
IDCT4_1D 0,1,2,3,4,5
pxor m7, m7
STORE_DIFF m0, m4, m7, [r0]
@@ -1,25 +1,39 @@
;*****************************************************************************
;* x86inc.asm
;*****************************************************************************
;* Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (C) 2005-2008 x264 project
;*
;* This file is part of FFmpeg.
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Anton Mitrofanov <BugMaster@narod.ru>
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************
; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.
; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .
%define program_name ff
%ifdef ARCH_X86_64
%ifidn __OUTPUT_FORMAT__,win32
%define WIN64
@@ -28,6 +42,12 @@
%endif
%endif
%ifdef PREFIX
%define mangle(x) _ %+ x
%else
%define mangle(x) x
%endif
; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register, assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
@@ -47,28 +67,16 @@
%endif
%endmacro
; PIC support macros.
; x86_64 can't fit 64bit address literals in most instruction types,
; so shared objects (under the assumption that they might be anywhere
; in memory) must use an address mode that does fit.
; So all accesses to global variables must use this macro, e.g.
; mov eax, [foo GLOBAL]
; instead of
; mov eax, [foo]
;
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
%ifdef WIN64
%define PIC
%elifndef ARCH_X86_64
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
%undef PIC
%endif
%ifdef PIC
%define GLOBAL wrt rip
%else
%define GLOBAL
default rel
%endif
; Macros to eliminate most code duplication between x86_32 and x86_64:
@@ -163,7 +171,7 @@ DECLARE_REG_SIZE bp, bpl
%endrep
%endmacro
DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
%ifdef ARCH_X86_64
%define gprsize 8
@@ -259,15 +267,11 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%endif
%endmacro
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
ASSERT %2 >= %1
%assign regs_used %2
ASSERT regs_used <= 7
%if %0 > 2
%assign xmm_regs_used %3
%else
%assign xmm_regs_used 0
%endif
%assign xmm_regs_used %3
ASSERT xmm_regs_used <= 16
%if regs_used > 4
push r4
@@ -388,7 +392,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%endif
%endmacro
%macro PROLOGUE 2-4+ ; #args, #regs, arg_names...
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
ASSERT %2 >= %1
%assign regs_used %2
ASSERT regs_used <= 7
@@ -434,10 +438,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
; Symbol prefix for C linkage
%macro cglobal 1-2+
%xdefine %1 ff_%1
%ifdef PREFIX
%xdefine %1 _ %+ %1
%endif
%xdefine %1 mangle(program_name %+ _ %+ %1)
%xdefine %1.skip_prologue %1 %+ .skip_prologue
%ifidn __OUTPUT_FORMAT__,elf
global %1:function hidden
@@ -454,21 +455,28 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%endmacro
%macro cextern 1
%ifdef PREFIX
%xdefine %1 _%1
%endif
%xdefine %1 mangle(program_name %+ _ %+ %1)
extern %1
%endmacro
;like cextern, but without the prefix
%macro cextern_naked 1
%xdefine %1 mangle(%1)
extern %1
%endmacro
%macro const 2+
%xdefine %1 mangle(program_name %+ _ %+ %1)
global %1
%1: %2
%endmacro
; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
%assign FENC_STRIDE 16
%assign FDEC_STRIDE 32
; merge mmx and sse*
%macro CAT_XDEFINE 3
@@ -575,7 +583,10 @@ INIT_MMX
%endrep
%endmacro
%macro SAVE_MM_PERMUTATION 1
; If SAVE_MM_PERMUTATION is placed at the end of a function and given the
; function name, then any later calls to that function will automatically
; load the permutation, so values can be returned in mmregs.
%macro SAVE_MM_PERMUTATION 1 ; name to save as
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE %1_m, %%i, m %+ %%i
@@ -583,7 +594,7 @@ INIT_MMX
%endrep
%endmacro
%macro LOAD_MM_PERMUTATION 1
%macro LOAD_MM_PERMUTATION 1 ; name to load from
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE m, %%i, %1_m %+ %%i
@@ -599,7 +610,7 @@ INIT_MMX
%endif
%endmacro
;Substitutions that reduce instruction size but are functionally equivalent
; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
%ifnum %2
%if %2==128
@@ -1,7 +1,10 @@
;*****************************************************************************
;* x86util.asm
;*****************************************************************************
;* Copyright (C) 2008 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (C) 2008 x264 project
;*
;* Authors: Holger Lubitz <holger@lubitz.org>
;* Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -18,6 +21,9 @@
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%assign FENC_STRIDE 16
%assign FDEC_STRIDE 32
%macro SBUTTERFLY 4
mova m%4, m%2
punpckl%1 m%2, m%3
@@ -25,6 +31,13 @@
SWAP %3, %4
%endmacro
%macro SBUTTERFLY2 4
mova m%4, m%2
punpckh%1 m%2, m%3
punpckl%1 m%4, m%3
SWAP %2, %4, %3
%endmacro
%macro TRANSPOSE4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
@@ -123,14 +136,40 @@
pabsw %2, %2
%endmacro
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
%macro ABSB_MMX 2
pxor %2, %2
psubb %2, %1
pminub %1, %2
%endmacro
%macro ABSB2_MMX 4
pxor %3, %3
pxor %4, %4
psubb %3, %1
psubb %4, %2
pminub %1, %3
pminub %2, %4
%endmacro
%macro ABSB_SSSE3 2
pabsb %1, %1
%endmacro
%macro ABSB2_SSSE3 4
pabsb %1, %1
pabsb %2, %2
%endmacro
%macro ABS4 6
ABS2 %1, %2, %5, %6
ABS2 %3, %4, %5, %6
%endmacro
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
%define ABSB ABSB_MMX
%define ABSB2 ABSB2_MMX
%macro SPLATB_MMX 3
movd %1, [%2-3] ;to avoid crossing a cacheline
punpcklbw %1, %1
@@ -226,10 +265,10 @@
; %3/%4: source regs
; %5/%6: tmp regs
%ifidn %1, d
%define mask [mask_10 GLOBAL]
%define mask [mask_10]
%define shift 16
%elifidn %1, q
%define mask [mask_1100 GLOBAL]
%define mask [mask_1100]
%define shift 32
%endif
%if %0==6 ; less dependency if we have two tmp
@@ -383,10 +422,10 @@
%macro SUMSUBD2_AB 4
mova %4, %1
mova %3, %2
psraw %2, 1
psraw %1, 1
paddw %2, %4
psubw %1, %3
psraw %2, 1 ; %2: %2>>1
psraw %1, 1 ; %1: %1>>1
paddw %2, %4 ; %2: %2>>1+%1
psubw %1, %3 ; %1: %1>>1-%2
%endmacro
%macro DCT4_1D 5
@@ -407,16 +446,27 @@
%macro IDCT4_1D 5-6
%ifnum %5
SUMSUBD2_AB m%2, m%4, m%6, m%5
; %2: %2>>1-%4 %4: %2+%4>>1
SUMSUB_BA m%3, m%1, m%6
; %3: %1+%3 %1: %1-%3
SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
; %4: %1+%3 + (%2+%4>>1)
; %3: %1+%3 - (%2+%4>>1)
; %2: %1-%3 + (%2>>1-%4)
; %1: %1-%3 - (%2>>1-%4)
%else
SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
SUMSUB_BA m%3, m%1
SUMSUB_BADC m%4, m%3, m%2, m%1
%endif
SWAP %1, %4, %3
; %1: %1+%3 + (%2+%4>>1) row0
; %2: %1-%3 + (%2>>1-%4) row1
; %3: %1-%3 - (%2>>1-%4) row2
; %4: %1+%3 - (%2+%4>>1) row3
%endmacro
%macro LOAD_DIFF 5
%ifidn %3, none
movh %1, %4
@@ -512,4 +562,3 @@
packuswb %1, %1
movh %4, %1
%endmacro