Modify the asm accordingly. GLOBAL is now no longer necessary for PIC-compliant loads. Originally committed as revision 23739 to svn://svn.ffmpeg.org/ffmpeg/trunktags/n0.8
| @@ -40,7 +40,7 @@ section .text align=16 | |||||
| %endmacro | %endmacro | ||||
| %macro FLOAT_TO_INT16_INTERLEAVE6 1 | %macro FLOAT_TO_INT16_INTERLEAVE6 1 | ||||
| ; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) | |||||
| ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) | |||||
| cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 | cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 | ||||
| %ifdef ARCH_X86_64 | %ifdef ARCH_X86_64 | ||||
| %define lend r10d | %define lend r10d | ||||
| @@ -272,7 +272,7 @@ SCALARPRODUCT_LOOP 0 | |||||
| ; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) | |||||
| ; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) | |||||
| cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top | cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top | ||||
| movq mm0, [topq] | movq mm0, [topq] | ||||
| movq mm2, mm0 | movq mm2, mm0 | ||||
| @@ -370,23 +370,23 @@ cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_to | |||||
| RET | RET | ||||
| %endmacro | %endmacro | ||||
| ; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left) | |||||
| ; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left) | |||||
| INIT_MMX | INIT_MMX | ||||
| cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left | cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left | ||||
| .skip_prologue: | .skip_prologue: | ||||
| mova m5, [pb_7 GLOBAL] | |||||
| mova m4, [pb_zzzz3333zzzzbbbb GLOBAL] | |||||
| mova m3, [pb_zz11zz55zz99zzdd GLOBAL] | |||||
| mova m5, [pb_7] | |||||
| mova m4, [pb_zzzz3333zzzzbbbb] | |||||
| mova m3, [pb_zz11zz55zz99zzdd] | |||||
| movd m0, leftm | movd m0, leftm | ||||
| psllq m0, 56 | psllq m0, 56 | ||||
| ADD_HFYU_LEFT_LOOP 1 | ADD_HFYU_LEFT_LOOP 1 | ||||
| INIT_XMM | INIT_XMM | ||||
| cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left | cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left | ||||
| mova m5, [pb_f GLOBAL] | |||||
| mova m6, [pb_zzzzzzzz77777777 GLOBAL] | |||||
| mova m4, [pb_zzzz3333zzzzbbbb GLOBAL] | |||||
| mova m3, [pb_zz11zz55zz99zzdd GLOBAL] | |||||
| mova m5, [pb_f] | |||||
| mova m6, [pb_zzzzzzzz77777777] | |||||
| mova m4, [pb_zzzz3333zzzzbbbb] | |||||
| mova m3, [pb_zz11zz55zz99zzdd] | |||||
| movd m0, leftm | movd m0, leftm | ||||
| pslldq m0, 15 | pslldq m0, 15 | ||||
| test srcq, 15 | test srcq, 15 | ||||
| @@ -398,7 +398,7 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left | |||||
| ADD_HFYU_LEFT_LOOP 0 | ADD_HFYU_LEFT_LOOP 0 | ||||
| ; float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len) | |||||
| ; float scalarproduct_float_sse(const float *v1, const float *v2, int len) | |||||
| cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset | cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset | ||||
| neg offsetq | neg offsetq | ||||
| shl offsetq, 2 | shl offsetq, 2 | ||||
| @@ -35,7 +35,7 @@ ps_m1p1: dd 1<<31, 0 | |||||
| %assign i 16 | %assign i 16 | ||||
| %rep 13 | %rep 13 | ||||
| cextern ff_cos_ %+ i | |||||
| cextern cos_ %+ i | |||||
| %assign i i<<1 | %assign i i<<1 | ||||
| %endrep | %endrep | ||||
| @@ -64,7 +64,7 @@ section .text align=16 | |||||
| mova %5, %3 | mova %5, %3 | ||||
| pfsub %3, %4 | pfsub %3, %4 | ||||
| pfadd %5, %4 ; {t6,t5} | pfadd %5, %4 ; {t6,t5} | ||||
| pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7} | |||||
| pxor %3, [ps_m1p1] ; {t8,t7} | |||||
| mova %6, %1 | mova %6, %1 | ||||
| pswapd %3, %3 | pswapd %3, %3 | ||||
| pfadd %1, %5 ; {r0,i0} | pfadd %1, %5 ; {r0,i0} | ||||
| @@ -105,8 +105,8 @@ section .text align=16 | |||||
| addps %6, %5 ; {t1,t2,t3,t4} | addps %6, %5 ; {t1,t2,t3,t4} | ||||
| mova %5, %3 | mova %5, %3 | ||||
| shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} | shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} | ||||
| mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7} | |||||
| mulps %5, [ps_root2 GLOBAL] | |||||
| mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} | |||||
| mulps %5, [ps_root2] | |||||
| addps %3, %5 ; {t8,t7,ta,t9} | addps %3, %5 ; {t8,t7,ta,t9} | ||||
| mova %5, %6 | mova %5, %6 | ||||
| shufps %6, %3, 0x36 ; {t3,t2,t9,t8} | shufps %6, %3, 0x36 ; {t3,t2,t9,t8} | ||||
| @@ -309,7 +309,7 @@ fft16_sse: | |||||
| mova m6, Z(6) | mova m6, Z(6) | ||||
| mova m7, Z(7) | mova m7, Z(7) | ||||
| T4_SSE m6, m7, m0 | T4_SSE m6, m7, m0 | ||||
| PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL] | |||||
| PASS_SMALL 0, [cos_16], [cos_16+16] | |||||
| ret | ret | ||||
| @@ -342,12 +342,12 @@ fft8%1: | |||||
| T2_3DN m6, m7, Z(6), Z(7) | T2_3DN m6, m7, Z(6), Z(7) | ||||
| pswapd m0, m5 | pswapd m0, m5 | ||||
| pswapd m2, m7 | pswapd m2, m7 | ||||
| pxor m0, [ps_m1p1 GLOBAL] | |||||
| pxor m2, [ps_m1p1 GLOBAL] | |||||
| pxor m0, [ps_m1p1] | |||||
| pxor m2, [ps_m1p1] | |||||
| pfsub m5, m0 | pfsub m5, m0 | ||||
| pfadd m7, m2 | pfadd m7, m2 | ||||
| pfmul m5, [ps_root2 GLOBAL] | |||||
| pfmul m7, [ps_root2 GLOBAL] | |||||
| pfmul m5, [ps_root2] | |||||
| pfmul m7, [ps_root2] | |||||
| T4_3DN m1, m3, m5, m7, m0, m2 | T4_3DN m1, m3, m5, m7, m0, m2 | ||||
| mova Z(5), m5 | mova Z(5), m5 | ||||
| mova Z(7), m7 | mova Z(7), m7 | ||||
| @@ -445,7 +445,7 @@ fft %+ n %+ %3%2: | |||||
| add r0, n*2 - (n2&(-2<<%1)) | add r0, n*2 - (n2&(-2<<%1)) | ||||
| call fft %+ n4 %+ %2 | call fft %+ n4 %+ %2 | ||||
| sub r0, n*6 + (n2&(-2<<%1)) | sub r0, n*6 + (n2&(-2<<%1)) | ||||
| lea r1, [ff_cos_ %+ n GLOBAL] | |||||
| lea r1, [cos_ %+ n] | |||||
| mov r2d, n4/2 | mov r2d, n4/2 | ||||
| jmp pass%3%2 | jmp pass%3%2 | ||||
| @@ -461,10 +461,10 @@ section .text | |||||
| ; On x86_32, this function does the register saving and restoring for all of fft. | ; On x86_32, this function does the register saving and restoring for all of fft. | ||||
| ; The others pass args in registers and don't spill anything. | ; The others pass args in registers and don't spill anything. | ||||
| cglobal fft_dispatch%3%2, 2,5,8, z, nbits | cglobal fft_dispatch%3%2, 2,5,8, z, nbits | ||||
| lea r2, [dispatch_tab%3%2 GLOBAL] | |||||
| lea r2, [dispatch_tab%3%2] | |||||
| mov r2, [r2 + (nbitsq-2)*gprsize] | mov r2, [r2 + (nbitsq-2)*gprsize] | ||||
| %ifdef PIC | %ifdef PIC | ||||
| lea r3, [$$ GLOBAL] | |||||
| lea r3, [$$] | |||||
| add r2, r3 | add r2, r3 | ||||
| %endif | %endif | ||||
| call r2 | call r2 | ||||
| @@ -234,18 +234,18 @@ SECTION .text | |||||
| %macro DEBLOCK_P0_Q0 0 | %macro DEBLOCK_P0_Q0 0 | ||||
| mova m5, m1 | mova m5, m1 | ||||
| pxor m5, m2 ; p0^q0 | pxor m5, m2 ; p0^q0 | ||||
| pand m5, [pb_01 GLOBAL] ; (p0^q0)&1 | |||||
| pand m5, [pb_01] ; (p0^q0)&1 | |||||
| pcmpeqb m4, m4 | pcmpeqb m4, m4 | ||||
| pxor m3, m4 | pxor m3, m4 | ||||
| pavgb m3, m0 ; (p1 - q1 + 256)>>1 | pavgb m3, m0 ; (p1 - q1 + 256)>>1 | ||||
| pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 | |||||
| pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 | |||||
| pxor m4, m1 | pxor m4, m1 | ||||
| pavgb m4, m2 ; (q0 - p0 + 256)>>1 | pavgb m4, m2 ; (q0 - p0 + 256)>>1 | ||||
| pavgb m3, m5 | pavgb m3, m5 | ||||
| paddusb m3, m4 ; d+128+33 | paddusb m3, m4 ; d+128+33 | ||||
| mova m6, [pb_a1 GLOBAL] | |||||
| mova m6, [pb_a1] | |||||
| psubusb m6, m3 | psubusb m6, m3 | ||||
| psubusb m3, [pb_a1 GLOBAL] | |||||
| psubusb m3, [pb_a1] | |||||
| pminub m6, m7 | pminub m6, m7 | ||||
| pminub m3, m7 | pminub m3, m7 | ||||
| psubusb m1, m6 | psubusb m1, m6 | ||||
| @@ -263,7 +263,7 @@ SECTION .text | |||||
| pavgb %6, m2 | pavgb %6, m2 | ||||
| pavgb %2, %6 ; avg(p2,avg(p0,q0)) | pavgb %2, %6 ; avg(p2,avg(p0,q0)) | ||||
| pxor %6, %3 | pxor %6, %3 | ||||
| pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1 | |||||
| pand %6, [pb_01] ; (p2^avg(p0,q0))&1 | |||||
| psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 | psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 | ||||
| mova %6, %1 | mova %6, %1 | ||||
| psubusb %6, %5 | psubusb %6, %5 | ||||
| @@ -612,8 +612,8 @@ DEBLOCK_LUMA sse2, v, 16 | |||||
| %define mask0 spill(2) | %define mask0 spill(2) | ||||
| %define mask1p spill(3) | %define mask1p spill(3) | ||||
| %define mask1q spill(4) | %define mask1q spill(4) | ||||
| %define mpb_00 [pb_00 GLOBAL] | |||||
| %define mpb_01 [pb_01 GLOBAL] | |||||
| %define mpb_00 [pb_00] | |||||
| %define mpb_01 [pb_01] | |||||
| %endif | %endif | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| @@ -637,7 +637,7 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 | |||||
| mova q1, [r0+r1] | mova q1, [r0+r1] | ||||
| %ifdef ARCH_X86_64 | %ifdef ARCH_X86_64 | ||||
| pxor mpb_00, mpb_00 | pxor mpb_00, mpb_00 | ||||
| mova mpb_01, [pb_01 GLOBAL] | |||||
| mova mpb_01, [pb_01] | |||||
| LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 | LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 | ||||
| SWAP 7, 12 ; m12=mask0 | SWAP 7, 12 ; m12=mask0 | ||||
| pavgb t5, mpb_00 | pavgb t5, mpb_00 | ||||
| @@ -656,8 +656,8 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 | |||||
| LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 | LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 | ||||
| mova m4, t5 | mova m4, t5 | ||||
| mova mask0, m7 | mova mask0, m7 | ||||
| pavgb m4, [pb_00 GLOBAL] | |||||
| pavgb m4, [pb_01 GLOBAL] ; alpha/4+1 | |||||
| pavgb m4, [pb_00] | |||||
| pavgb m4, [pb_01] ; alpha/4+1 | |||||
| DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 | DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 | ||||
| pand m6, mask0 | pand m6, mask0 | ||||
| DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 | DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 | ||||
| @@ -43,7 +43,7 @@ cglobal x264_add8x4_idct_sse2, 3,3,8 | |||||
| movhps m3, [r1+56] | movhps m3, [r1+56] | ||||
| IDCT4_1D 0,1,2,3,4,5 | IDCT4_1D 0,1,2,3,4,5 | ||||
| TRANSPOSE2x4x4W 0,1,2,3,4 | TRANSPOSE2x4x4W 0,1,2,3,4 | ||||
| paddw m0, [pw_32 GLOBAL] | |||||
| paddw m0, [pw_32] | |||||
| IDCT4_1D 0,1,2,3,4,5 | IDCT4_1D 0,1,2,3,4,5 | ||||
| pxor m7, m7 | pxor m7, m7 | ||||
| STORE_DIFF m0, m4, m7, [r0] | STORE_DIFF m0, m4, m7, [r0] | ||||
| @@ -1,25 +1,39 @@ | |||||
| ;***************************************************************************** | ;***************************************************************************** | ||||
| ;* x86inc.asm | ;* x86inc.asm | ||||
| ;***************************************************************************** | ;***************************************************************************** | ||||
| ;* Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu> | |||||
| ;* Copyright (C) 2005-2008 x264 project | |||||
| ;* | ;* | ||||
| ;* This file is part of FFmpeg. | |||||
| ;* Authors: Loren Merritt <lorenm@u.washington.edu> | |||||
| ;* Anton Mitrofanov <BugMaster@narod.ru> | |||||
| ;* | ;* | ||||
| ;* FFmpeg is free software; you can redistribute it and/or | |||||
| ;* modify it under the terms of the GNU Lesser General Public | |||||
| ;* License as published by the Free Software Foundation; either | |||||
| ;* version 2.1 of the License, or (at your option) any later version. | |||||
| ;* Permission to use, copy, modify, and/or distribute this software for any | |||||
| ;* purpose with or without fee is hereby granted, provided that the above | |||||
| ;* copyright notice and this permission notice appear in all copies. | |||||
| ;* | ;* | ||||
| ;* FFmpeg is distributed in the hope that it will be useful, | |||||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| ;* Lesser General Public License for more details. | |||||
| ;* | |||||
| ;* You should have received a copy of the GNU Lesser General Public | |||||
| ;* License along with FFmpeg; if not, write to the Free Software | |||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |||||
| ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |||||
| ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | |||||
| ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |||||
| ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |||||
| ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |||||
| ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |||||
| ;***************************************************************************** | ;***************************************************************************** | ||||
| ; This is a header file for the x264ASM assembly language, which uses | |||||
| ; NASM/YASM syntax combined with a large number of macros to provide easy | |||||
| ; abstraction between different calling conventions (x86_32, win64, linux64). | |||||
| ; It also has various other useful features to simplify writing the kind of | |||||
| ; DSP functions that are most often used in x264. | |||||
| ; Unlike the rest of x264, this file is available under an ISC license, as it | |||||
| ; has significant usefulness outside of x264 and we want it to be available | |||||
| ; to the largest audience possible. Of course, if you modify it for your own | |||||
| ; purposes to add a new feature, we strongly encourage contributing a patch | |||||
| ; as this feature might be useful for others as well. Send patches or ideas | |||||
| ; to x264-devel@videolan.org . | |||||
| %define program_name ff | |||||
| %ifdef ARCH_X86_64 | %ifdef ARCH_X86_64 | ||||
| %ifidn __OUTPUT_FORMAT__,win32 | %ifidn __OUTPUT_FORMAT__,win32 | ||||
| %define WIN64 | %define WIN64 | ||||
| @@ -28,6 +42,12 @@ | |||||
| %endif | %endif | ||||
| %endif | %endif | ||||
| %ifdef PREFIX | |||||
| %define mangle(x) _ %+ x | |||||
| %else | |||||
| %define mangle(x) x | |||||
| %endif | |||||
| ; FIXME: All of the 64bit asm functions that take a stride as an argument | ; FIXME: All of the 64bit asm functions that take a stride as an argument | ||||
| ; via register, assume that the high dword of that register is filled with 0. | ; via register, assume that the high dword of that register is filled with 0. | ||||
| ; This is true in practice (since we never do any 64bit arithmetic on strides, | ; This is true in practice (since we never do any 64bit arithmetic on strides, | ||||
| @@ -47,28 +67,16 @@ | |||||
| %endif | %endif | ||||
| %endmacro | %endmacro | ||||
| ; PIC support macros. | |||||
| ; x86_64 can't fit 64bit address literals in most instruction types, | |||||
| ; so shared objects (under the assumption that they might be anywhere | |||||
| ; in memory) must use an address mode that does fit. | |||||
| ; So all accesses to global variables must use this macro, e.g. | |||||
| ; mov eax, [foo GLOBAL] | |||||
| ; instead of | |||||
| ; mov eax, [foo] | |||||
| ; | |||||
| ; x86_32 doesn't require PIC. | |||||
| ; Some distros prefer shared objects to be PIC, but nothing breaks if | |||||
| ; the code contains a few textrels, so we'll skip that complexity. | |||||
| %ifdef WIN64 | %ifdef WIN64 | ||||
| %define PIC | %define PIC | ||||
| %elifndef ARCH_X86_64 | %elifndef ARCH_X86_64 | ||||
| ; x86_32 doesn't require PIC. | |||||
| ; Some distros prefer shared objects to be PIC, but nothing breaks if | |||||
| ; the code contains a few textrels, so we'll skip that complexity. | |||||
| %undef PIC | %undef PIC | ||||
| %endif | %endif | ||||
| %ifdef PIC | %ifdef PIC | ||||
| %define GLOBAL wrt rip | |||||
| %else | |||||
| %define GLOBAL | |||||
| default rel | |||||
| %endif | %endif | ||||
| ; Macros to eliminate most code duplication between x86_32 and x86_64: | ; Macros to eliminate most code duplication between x86_32 and x86_64: | ||||
| @@ -163,7 +171,7 @@ DECLARE_REG_SIZE bp, bpl | |||||
| %endrep | %endrep | ||||
| %endmacro | %endmacro | ||||
| DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7 | |||||
| DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9 | |||||
| %ifdef ARCH_X86_64 | %ifdef ARCH_X86_64 | ||||
| %define gprsize 8 | %define gprsize 8 | ||||
| @@ -259,15 +267,11 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] | |||||
| %endif | %endif | ||||
| %endmacro | %endmacro | ||||
| %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... | |||||
| %macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... | |||||
| ASSERT %2 >= %1 | ASSERT %2 >= %1 | ||||
| %assign regs_used %2 | %assign regs_used %2 | ||||
| ASSERT regs_used <= 7 | ASSERT regs_used <= 7 | ||||
| %if %0 > 2 | |||||
| %assign xmm_regs_used %3 | |||||
| %else | |||||
| %assign xmm_regs_used 0 | |||||
| %endif | |||||
| %assign xmm_regs_used %3 | |||||
| ASSERT xmm_regs_used <= 16 | ASSERT xmm_regs_used <= 16 | ||||
| %if regs_used > 4 | %if regs_used > 4 | ||||
| push r4 | push r4 | ||||
| @@ -388,7 +392,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] | |||||
| %endif | %endif | ||||
| %endmacro | %endmacro | ||||
| %macro PROLOGUE 2-4+ ; #args, #regs, arg_names... | |||||
| %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... | |||||
| ASSERT %2 >= %1 | ASSERT %2 >= %1 | ||||
| %assign regs_used %2 | %assign regs_used %2 | ||||
| ASSERT regs_used <= 7 | ASSERT regs_used <= 7 | ||||
| @@ -434,10 +438,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] | |||||
| ; Symbol prefix for C linkage | ; Symbol prefix for C linkage | ||||
| %macro cglobal 1-2+ | %macro cglobal 1-2+ | ||||
| %xdefine %1 ff_%1 | |||||
| %ifdef PREFIX | |||||
| %xdefine %1 _ %+ %1 | |||||
| %endif | |||||
| %xdefine %1 mangle(program_name %+ _ %+ %1) | |||||
| %xdefine %1.skip_prologue %1 %+ .skip_prologue | %xdefine %1.skip_prologue %1 %+ .skip_prologue | ||||
| %ifidn __OUTPUT_FORMAT__,elf | %ifidn __OUTPUT_FORMAT__,elf | ||||
| global %1:function hidden | global %1:function hidden | ||||
| @@ -454,21 +455,28 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] | |||||
| %endmacro | %endmacro | ||||
| %macro cextern 1 | %macro cextern 1 | ||||
| %ifdef PREFIX | |||||
| %xdefine %1 _%1 | |||||
| %endif | |||||
| %xdefine %1 mangle(program_name %+ _ %+ %1) | |||||
| extern %1 | |||||
| %endmacro | |||||
| ;like cextern, but without the prefix | |||||
| %macro cextern_naked 1 | |||||
| %xdefine %1 mangle(%1) | |||||
| extern %1 | extern %1 | ||||
| %endmacro | %endmacro | ||||
| %macro const 2+ | |||||
| %xdefine %1 mangle(program_name %+ _ %+ %1) | |||||
| global %1 | |||||
| %1: %2 | |||||
| %endmacro | |||||
| ; This is needed for ELF, otherwise the GNU linker assumes the stack is | ; This is needed for ELF, otherwise the GNU linker assumes the stack is | ||||
| ; executable by default. | ; executable by default. | ||||
| %ifidn __OUTPUT_FORMAT__,elf | %ifidn __OUTPUT_FORMAT__,elf | ||||
| SECTION .note.GNU-stack noalloc noexec nowrite progbits | SECTION .note.GNU-stack noalloc noexec nowrite progbits | ||||
| %endif | %endif | ||||
| %assign FENC_STRIDE 16 | |||||
| %assign FDEC_STRIDE 32 | |||||
| ; merge mmx and sse* | ; merge mmx and sse* | ||||
| %macro CAT_XDEFINE 3 | %macro CAT_XDEFINE 3 | ||||
| @@ -575,7 +583,10 @@ INIT_MMX | |||||
| %endrep | %endrep | ||||
| %endmacro | %endmacro | ||||
| %macro SAVE_MM_PERMUTATION 1 | |||||
| ; If SAVE_MM_PERMUTATION is placed at the end of a function and given the | |||||
| ; function name, then any later calls to that function will automatically | |||||
| ; load the permutation, so values can be returned in mmregs. | |||||
| %macro SAVE_MM_PERMUTATION 1 ; name to save as | |||||
| %assign %%i 0 | %assign %%i 0 | ||||
| %rep num_mmregs | %rep num_mmregs | ||||
| CAT_XDEFINE %1_m, %%i, m %+ %%i | CAT_XDEFINE %1_m, %%i, m %+ %%i | ||||
| @@ -583,7 +594,7 @@ INIT_MMX | |||||
| %endrep | %endrep | ||||
| %endmacro | %endmacro | ||||
| %macro LOAD_MM_PERMUTATION 1 | |||||
| %macro LOAD_MM_PERMUTATION 1 ; name to load from | |||||
| %assign %%i 0 | %assign %%i 0 | ||||
| %rep num_mmregs | %rep num_mmregs | ||||
| CAT_XDEFINE m, %%i, %1_m %+ %%i | CAT_XDEFINE m, %%i, %1_m %+ %%i | ||||
| @@ -599,7 +610,7 @@ INIT_MMX | |||||
| %endif | %endif | ||||
| %endmacro | %endmacro | ||||
| ;Substitutions that reduce instruction size but are functionally equivalent | |||||
| ; Substitutions that reduce instruction size but are functionally equivalent | |||||
| %macro add 2 | %macro add 2 | ||||
| %ifnum %2 | %ifnum %2 | ||||
| %if %2==128 | %if %2==128 | ||||
| @@ -1,7 +1,10 @@ | |||||
| ;***************************************************************************** | ;***************************************************************************** | ||||
| ;* x86util.asm | ;* x86util.asm | ||||
| ;***************************************************************************** | ;***************************************************************************** | ||||
| ;* Copyright (C) 2008 Loren Merritt <lorenm@u.washington.edu> | |||||
| ;* Copyright (C) 2008 x264 project | |||||
| ;* | |||||
| ;* Authors: Holger Lubitz <holger@lubitz.org> | |||||
| ;* Loren Merritt <lorenm@u.washington.edu> | |||||
| ;* | ;* | ||||
| ;* This program is free software; you can redistribute it and/or modify | ;* This program is free software; you can redistribute it and/or modify | ||||
| ;* it under the terms of the GNU General Public License as published by | ;* it under the terms of the GNU General Public License as published by | ||||
| @@ -18,6 +21,9 @@ | |||||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | ||||
| ;***************************************************************************** | ;***************************************************************************** | ||||
| %assign FENC_STRIDE 16 | |||||
| %assign FDEC_STRIDE 32 | |||||
| %macro SBUTTERFLY 4 | %macro SBUTTERFLY 4 | ||||
| mova m%4, m%2 | mova m%4, m%2 | ||||
| punpckl%1 m%2, m%3 | punpckl%1 m%2, m%3 | ||||
| @@ -25,6 +31,13 @@ | |||||
| SWAP %3, %4 | SWAP %3, %4 | ||||
| %endmacro | %endmacro | ||||
| %macro SBUTTERFLY2 4 | |||||
| mova m%4, m%2 | |||||
| punpckh%1 m%2, m%3 | |||||
| punpckl%1 m%4, m%3 | |||||
| SWAP %2, %4, %3 | |||||
| %endmacro | |||||
| %macro TRANSPOSE4x4W 5 | %macro TRANSPOSE4x4W 5 | ||||
| SBUTTERFLY wd, %1, %2, %5 | SBUTTERFLY wd, %1, %2, %5 | ||||
| SBUTTERFLY wd, %3, %4, %5 | SBUTTERFLY wd, %3, %4, %5 | ||||
| @@ -123,14 +136,40 @@ | |||||
| pabsw %2, %2 | pabsw %2, %2 | ||||
| %endmacro | %endmacro | ||||
| %define ABS1 ABS1_MMX | |||||
| %define ABS2 ABS2_MMX | |||||
| %macro ABSB_MMX 2 | |||||
| pxor %2, %2 | |||||
| psubb %2, %1 | |||||
| pminub %1, %2 | |||||
| %endmacro | |||||
| %macro ABSB2_MMX 4 | |||||
| pxor %3, %3 | |||||
| pxor %4, %4 | |||||
| psubb %3, %1 | |||||
| psubb %4, %2 | |||||
| pminub %1, %3 | |||||
| pminub %2, %4 | |||||
| %endmacro | |||||
| %macro ABSB_SSSE3 2 | |||||
| pabsb %1, %1 | |||||
| %endmacro | |||||
| %macro ABSB2_SSSE3 4 | |||||
| pabsb %1, %1 | |||||
| pabsb %2, %2 | |||||
| %endmacro | |||||
| %macro ABS4 6 | %macro ABS4 6 | ||||
| ABS2 %1, %2, %5, %6 | ABS2 %1, %2, %5, %6 | ||||
| ABS2 %3, %4, %5, %6 | ABS2 %3, %4, %5, %6 | ||||
| %endmacro | %endmacro | ||||
| %define ABS1 ABS1_MMX | |||||
| %define ABS2 ABS2_MMX | |||||
| %define ABSB ABSB_MMX | |||||
| %define ABSB2 ABSB2_MMX | |||||
| %macro SPLATB_MMX 3 | %macro SPLATB_MMX 3 | ||||
| movd %1, [%2-3] ;to avoid crossing a cacheline | movd %1, [%2-3] ;to avoid crossing a cacheline | ||||
| punpcklbw %1, %1 | punpcklbw %1, %1 | ||||
| @@ -226,10 +265,10 @@ | |||||
| ; %3/%4: source regs | ; %3/%4: source regs | ||||
| ; %5/%6: tmp regs | ; %5/%6: tmp regs | ||||
| %ifidn %1, d | %ifidn %1, d | ||||
| %define mask [mask_10 GLOBAL] | |||||
| %define mask [mask_10] | |||||
| %define shift 16 | %define shift 16 | ||||
| %elifidn %1, q | %elifidn %1, q | ||||
| %define mask [mask_1100 GLOBAL] | |||||
| %define mask [mask_1100] | |||||
| %define shift 32 | %define shift 32 | ||||
| %endif | %endif | ||||
| %if %0==6 ; less dependency if we have two tmp | %if %0==6 ; less dependency if we have two tmp | ||||
| @@ -383,10 +422,10 @@ | |||||
| %macro SUMSUBD2_AB 4 | %macro SUMSUBD2_AB 4 | ||||
| mova %4, %1 | mova %4, %1 | ||||
| mova %3, %2 | mova %3, %2 | ||||
| psraw %2, 1 | |||||
| psraw %1, 1 | |||||
| paddw %2, %4 | |||||
| psubw %1, %3 | |||||
| psraw %2, 1 ; %2: %2>>1 | |||||
| psraw %1, 1 ; %1: %1>>1 | |||||
| paddw %2, %4 ; %2: %2>>1+%1 | |||||
| psubw %1, %3 ; %1: %1>>1-%2 | |||||
| %endmacro | %endmacro | ||||
| %macro DCT4_1D 5 | %macro DCT4_1D 5 | ||||
| @@ -407,16 +446,27 @@ | |||||
| %macro IDCT4_1D 5-6 | %macro IDCT4_1D 5-6 | ||||
| %ifnum %5 | %ifnum %5 | ||||
| SUMSUBD2_AB m%2, m%4, m%6, m%5 | SUMSUBD2_AB m%2, m%4, m%6, m%5 | ||||
| ; %2: %2>>1-%4 %4: %2+%4>>1 | |||||
| SUMSUB_BA m%3, m%1, m%6 | SUMSUB_BA m%3, m%1, m%6 | ||||
| ; %3: %1+%3 %1: %1-%3 | |||||
| SUMSUB_BADC m%4, m%3, m%2, m%1, m%6 | SUMSUB_BADC m%4, m%3, m%2, m%1, m%6 | ||||
| ; %4: %1+%3 + (%2+%4>>1) | |||||
| ; %3: %1+%3 - (%2+%4>>1) | |||||
| ; %2: %1-%3 + (%2>>1-%4) | |||||
| ; %1: %1-%3 - (%2>>1-%4) | |||||
| %else | %else | ||||
| SUMSUBD2_AB m%2, m%4, [%5], [%5+16] | SUMSUBD2_AB m%2, m%4, [%5], [%5+16] | ||||
| SUMSUB_BA m%3, m%1 | SUMSUB_BA m%3, m%1 | ||||
| SUMSUB_BADC m%4, m%3, m%2, m%1 | SUMSUB_BADC m%4, m%3, m%2, m%1 | ||||
| %endif | %endif | ||||
| SWAP %1, %4, %3 | SWAP %1, %4, %3 | ||||
| ; %1: %1+%3 + (%2+%4>>1) row0 | |||||
| ; %2: %1-%3 + (%2>>1-%4) row1 | |||||
| ; %3: %1-%3 - (%2>>1-%4) row2 | |||||
| ; %4: %1+%3 - (%2+%4>>1) row3 | |||||
| %endmacro | %endmacro | ||||
| %macro LOAD_DIFF 5 | %macro LOAD_DIFF 5 | ||||
| %ifidn %3, none | %ifidn %3, none | ||||
| movh %1, %4 | movh %1, %4 | ||||
| @@ -512,4 +562,3 @@ | |||||
| packuswb %1, %1 | packuswb %1, %1 | ||||
| movh %4, %1 | movh %4, %1 | ||||
| %endmacro | %endmacro | ||||