Modify the asm accordingly. GLOBAL is now no longer necessary for PIC-compliant loads.

Originally committed as revision 23739 to svn://svn.ffmpeg.org/ffmpeg/trunk
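As an illustrative sketch only (not part of the committed patch): under the old scheme every PIC-safe load had to spell out the GLOBAL macro, which x86inc.asm expanded to "wrt rip" when PIC was defined; with "default rel" now emitted instead, the bare memory operand already assembles as RIP-relative. The symbol foo below is just the placeholder used in the old x86inc.asm comment.

    ; old convention: explicit GLOBAL marker (GLOBAL expands to "wrt rip" under PIC)
    mov   eax, [foo GLOBAL]
    ; new convention: "default rel" is in effect, so the plain form is already RIP-relative
    mov   eax, [foo]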
@@ -40,7 +40,7 @@ section .text align=16
%endmacro
%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
%define lend r10d
@@ -272,7 +272,7 @@ SCALARPRODUCT_LOOP 0
; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
movq mm0, [topq]
movq mm2, mm0
@@ -370,23 +370,23 @@ cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_to
RET
%endmacro
; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
mova m5, [pb_7 GLOBAL]
mova m4, [pb_zzzz3333zzzzbbbb GLOBAL]
mova m3, [pb_zz11zz55zz99zzdd GLOBAL]
mova m5, [pb_7]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm
psllq m0, 56
ADD_HFYU_LEFT_LOOP 1
INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
mova m5, [pb_f GLOBAL]
mova m6, [pb_zzzzzzzz77777777 GLOBAL]
mova m4, [pb_zzzz3333zzzzbbbb GLOBAL]
mova m3, [pb_zz11zz55zz99zzdd GLOBAL]
mova m5, [pb_f]
mova m6, [pb_zzzzzzzz77777777]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm
pslldq m0, 15
test srcq, 15
@@ -398,7 +398,7 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
ADD_HFYU_LEFT_LOOP 0
; float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len)
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
neg offsetq
shl offsetq, 2
@@ -35,7 +35,7 @@ ps_m1p1: dd 1<<31, 0
%assign i 16
%rep 13
cextern ff_cos_ %+ i
cextern cos_ %+ i
%assign i i<<1
%endrep
@@ -64,7 +64,7 @@ section .text align=16
mova %5, %3
pfsub %3, %4
pfadd %5, %4 ; {t6,t5}
pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7}
pxor %3, [ps_m1p1] ; {t8,t7}
mova %6, %1
pswapd %3, %3
pfadd %1, %5 ; {r0,i0}
@@ -105,8 +105,8 @@ section .text align=16
addps %6, %5 ; {t1,t2,t3,t4}
mova %5, %3
shufps %5, %5, 0xb1 ; {i5,r5,i7,r7}
mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7}
mulps %5, [ps_root2 GLOBAL]
mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
mulps %5, [ps_root2]
addps %3, %5 ; {t8,t7,ta,t9}
mova %5, %6
shufps %6, %3, 0x36 ; {t3,t2,t9,t8}
@@ -309,7 +309,7 @@ fft16_sse:
mova m6, Z(6)
mova m7, Z(7)
T4_SSE m6, m7, m0
PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL]
PASS_SMALL 0, [cos_16], [cos_16+16]
ret
@@ -342,12 +342,12 @@ fft8%1:
T2_3DN m6, m7, Z(6), Z(7)
pswapd m0, m5
pswapd m2, m7
pxor m0, [ps_m1p1 GLOBAL]
pxor m2, [ps_m1p1 GLOBAL]
pxor m0, [ps_m1p1]
pxor m2, [ps_m1p1]
pfsub m5, m0
pfadd m7, m2
pfmul m5, [ps_root2 GLOBAL]
pfmul m7, [ps_root2 GLOBAL]
pfmul m5, [ps_root2]
pfmul m7, [ps_root2]
T4_3DN m1, m3, m5, m7, m0, m2
mova Z(5), m5
mova Z(7), m7
@@ -445,7 +445,7 @@ fft %+ n %+ %3%2:
add r0, n*2 - (n2&(-2<<%1))
call fft %+ n4 %+ %2
sub r0, n*6 + (n2&(-2<<%1))
lea r1, [ff_cos_ %+ n GLOBAL]
lea r1, [cos_ %+ n]
mov r2d, n4/2
jmp pass%3%2
@@ -461,10 +461,10 @@ section .text
; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
lea r2, [dispatch_tab%3%2 GLOBAL]
lea r2, [dispatch_tab%3%2]
mov r2, [r2 + (nbitsq-2)*gprsize]
%ifdef PIC
lea r3, [$$ GLOBAL]
lea r3, [$$]
add r2, r3
%endif
call r2
@@ -234,18 +234,18 @@ SECTION .text
%macro DEBLOCK_P0_Q0 0
mova m5, m1
pxor m5, m2 ; p0^q0
pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
pand m5, [pb_01] ; (p0^q0)&1
pcmpeqb m4, m4
pxor m3, m4
pavgb m3, m0 ; (p1 - q1 + 256)>>1
pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
pxor m4, m1
pavgb m4, m2 ; (q0 - p0 + 256)>>1
pavgb m3, m5
paddusb m3, m4 ; d+128+33
mova m6, [pb_a1 GLOBAL]
mova m6, [pb_a1]
psubusb m6, m3
psubusb m3, [pb_a1 GLOBAL]
psubusb m3, [pb_a1]
pminub m6, m7
pminub m3, m7
psubusb m1, m6
@@ -263,7 +263,7 @@ SECTION .text
pavgb %6, m2
pavgb %2, %6 ; avg(p2,avg(p0,q0))
pxor %6, %3
pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
pand %6, [pb_01] ; (p2^avg(p0,q0))&1
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
mova %6, %1
psubusb %6, %5
@@ -612,8 +612,8 @@ DEBLOCK_LUMA sse2, v, 16
%define mask0 spill(2)
%define mask1p spill(3)
%define mask1q spill(4)
%define mpb_00 [pb_00 GLOBAL]
%define mpb_01 [pb_01 GLOBAL]
%define mpb_00 [pb_00]
%define mpb_01 [pb_01]
%endif
;-----------------------------------------------------------------------------
@@ -637,7 +637,7 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
mova q1, [r0+r1]
%ifdef ARCH_X86_64
pxor mpb_00, mpb_00
mova mpb_01, [pb_01 GLOBAL]
mova mpb_01, [pb_01]
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
SWAP 7, 12 ; m12=mask0
pavgb t5, mpb_00
@@ -656,8 +656,8 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
mova m4, t5
mova mask0, m7
pavgb m4, [pb_00 GLOBAL]
pavgb m4, [pb_01 GLOBAL] ; alpha/4+1
pavgb m4, [pb_00]
pavgb m4, [pb_01] ; alpha/4+1
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
pand m6, mask0
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
@@ -43,7 +43,7 @@ cglobal x264_add8x4_idct_sse2, 3,3,8
movhps m3, [r1+56]
IDCT4_1D 0,1,2,3,4,5
TRANSPOSE2x4x4W 0,1,2,3,4
paddw m0, [pw_32 GLOBAL]
paddw m0, [pw_32]
IDCT4_1D 0,1,2,3,4,5
pxor m7, m7
STORE_DIFF m0, m4, m7, [r0]
@@ -1,25 +1,39 @@
;*****************************************************************************
;* x86inc.asm
;*****************************************************************************
;* Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (C) 2005-2008 x264 project
;*
;* This file is part of FFmpeg.
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Anton Mitrofanov <BugMaster@narod.ru>
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************
; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.
; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .
%define program_name ff
%ifdef ARCH_X86_64
%ifidn __OUTPUT_FORMAT__,win32
%define WIN64
@@ -28,6 +42,12 @@
%endif
%endif
%ifdef PREFIX
%define mangle(x) _ %+ x
%else
%define mangle(x) x
%endif
; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register, assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
@@ -47,28 +67,16 @@
%endif
%endmacro
; PIC support macros.
; x86_64 can't fit 64bit address literals in most instruction types,
; so shared objects (under the assumption that they might be anywhere
; in memory) must use an address mode that does fit.
; So all accesses to global variables must use this macro, e.g.
; mov eax, [foo GLOBAL]
; instead of
; mov eax, [foo]
;
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
%ifdef WIN64
%define PIC
%elifndef ARCH_X86_64
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
%undef PIC
%endif
%ifdef PIC
%define GLOBAL wrt rip
%else
%define GLOBAL
default rel
%endif
; Macros to eliminate most code duplication between x86_32 and x86_64:
@@ -163,7 +171,7 @@ DECLARE_REG_SIZE bp, bpl
%endrep
%endmacro
DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
%ifdef ARCH_X86_64
%define gprsize 8
@@ -259,15 +267,11 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%endif
%endmacro
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
ASSERT %2 >= %1
%assign regs_used %2
ASSERT regs_used <= 7
%if %0 > 2
%assign xmm_regs_used %3
%else
%assign xmm_regs_used 0
%endif
%assign xmm_regs_used %3
ASSERT xmm_regs_used <= 16
%if regs_used > 4
push r4
@@ -388,7 +392,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%endif
%endmacro
%macro PROLOGUE 2-4+ ; #args, #regs, arg_names...
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
ASSERT %2 >= %1
%assign regs_used %2
ASSERT regs_used <= 7
@@ -434,10 +438,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
; Symbol prefix for C linkage
%macro cglobal 1-2+
%xdefine %1 ff_%1
%ifdef PREFIX
%xdefine %1 _ %+ %1
%endif
%xdefine %1 mangle(program_name %+ _ %+ %1)
%xdefine %1.skip_prologue %1 %+ .skip_prologue
%ifidn __OUTPUT_FORMAT__,elf
global %1:function hidden
@@ -454,21 +455,28 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%endmacro
%macro cextern 1
%ifdef PREFIX
%xdefine %1 _%1
%endif
%xdefine %1 mangle(program_name %+ _ %+ %1)
extern %1
%endmacro
;like cextern, but without the prefix
%macro cextern_naked 1
%xdefine %1 mangle(%1)
extern %1
%endmacro
%macro const 2+
%xdefine %1 mangle(program_name %+ _ %+ %1)
global %1
%1: %2
%endmacro
; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
%assign FENC_STRIDE 16
%assign FDEC_STRIDE 32
; merge mmx and sse*
%macro CAT_XDEFINE 3
@@ -575,7 +583,10 @@ INIT_MMX
%endrep
%endmacro
%macro SAVE_MM_PERMUTATION 1
; If SAVE_MM_PERMUTATION is placed at the end of a function and given the
; function name, then any later calls to that function will automatically
; load the permutation, so values can be returned in mmregs.
%macro SAVE_MM_PERMUTATION 1 ; name to save as
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE %1_m, %%i, m %+ %%i
@@ -583,7 +594,7 @@ INIT_MMX
%endrep
%endmacro
%macro LOAD_MM_PERMUTATION 1
%macro LOAD_MM_PERMUTATION 1 ; name to load from
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE m, %%i, %1_m %+ %%i
@@ -599,7 +610,7 @@ INIT_MMX
%endif
%endmacro
;Substitutions that reduce instruction size but are functionally equivalent
; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
%ifnum %2
%if %2==128
@@ -1,7 +1,10 @@
;*****************************************************************************
;* x86util.asm
;*****************************************************************************
;* Copyright (C) 2008 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (C) 2008 x264 project
;*
;* Authors: Holger Lubitz <holger@lubitz.org>
;* Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -18,6 +21,9 @@
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%assign FENC_STRIDE 16
%assign FDEC_STRIDE 32
%macro SBUTTERFLY 4
mova m%4, m%2
punpckl%1 m%2, m%3
@@ -25,6 +31,13 @@
SWAP %3, %4
%endmacro
%macro SBUTTERFLY2 4
mova m%4, m%2
punpckh%1 m%2, m%3
punpckl%1 m%4, m%3
SWAP %2, %4, %3
%endmacro
%macro TRANSPOSE4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
@@ -123,14 +136,40 @@
pabsw %2, %2
%endmacro
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
%macro ABSB_MMX 2
pxor %2, %2
psubb %2, %1
pminub %1, %2
%endmacro
%macro ABSB2_MMX 4
pxor %3, %3
pxor %4, %4
psubb %3, %1
psubb %4, %2
pminub %1, %3
pminub %2, %4
%endmacro
%macro ABSB_SSSE3 2
pabsb %1, %1
%endmacro
%macro ABSB2_SSSE3 4
pabsb %1, %1
pabsb %2, %2
%endmacro
%macro ABS4 6
ABS2 %1, %2, %5, %6
ABS2 %3, %4, %5, %6
%endmacro
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
%define ABSB ABSB_MMX
%define ABSB2 ABSB2_MMX
%macro SPLATB_MMX 3
movd %1, [%2-3] ;to avoid crossing a cacheline
punpcklbw %1, %1
@@ -226,10 +265,10 @@
; %3/%4: source regs
; %5/%6: tmp regs
%ifidn %1, d
%define mask [mask_10 GLOBAL]
%define mask [mask_10]
%define shift 16
%elifidn %1, q
%define mask [mask_1100 GLOBAL]
%define mask [mask_1100]
%define shift 32
%endif
%if %0==6 ; less dependency if we have two tmp
@@ -383,10 +422,10 @@
%macro SUMSUBD2_AB 4
mova %4, %1
mova %3, %2
psraw %2, 1
psraw %1, 1
paddw %2, %4
psubw %1, %3
psraw %2, 1 ; %2: %2>>1
psraw %1, 1 ; %1: %1>>1
paddw %2, %4 ; %2: %2>>1+%1
psubw %1, %3 ; %1: %1>>1-%2
%endmacro
%macro DCT4_1D 5
@@ -407,16 +446,27 @@
%macro IDCT4_1D 5-6
%ifnum %5
SUMSUBD2_AB m%2, m%4, m%6, m%5
; %2: %2>>1-%4 %4: %2+%4>>1
SUMSUB_BA m%3, m%1, m%6
; %3: %1+%3 %1: %1-%3
SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
; %4: %1+%3 + (%2+%4>>1)
; %3: %1+%3 - (%2+%4>>1)
; %2: %1-%3 + (%2>>1-%4)
; %1: %1-%3 - (%2>>1-%4)
%else
SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
SUMSUB_BA m%3, m%1
SUMSUB_BADC m%4, m%3, m%2, m%1
%endif
SWAP %1, %4, %3
; %1: %1+%3 + (%2+%4>>1) row0
; %2: %1-%3 + (%2>>1-%4) row1
; %3: %1-%3 - (%2>>1-%4) row2
; %4: %1+%3 - (%2+%4>>1) row3
%endmacro
%macro LOAD_DIFF 5
%ifidn %3, none
movh %1, %4
@@ -512,4 +562,3 @@
packuswb %1, %1
movh %4, %1
%endmacro