* commit '57b5b84e208ad61ffdd74ad849bed212deb92bc5': x86: dsputil: Move ff_apply_window_int16_* bits to ac3dsp, where they belong. Merged-by: Michael Niedermayer <michaelni@gmx.at> (tags/n2.3)
| @@ -35,6 +35,10 @@ pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7 | |||
| pd_1: times 4 dd 1 | |||
| pd_151: times 4 dd 151 | |||
| ; used in ff_apply_window_int16(): pb_revwords is loaded into m5 as the pshufb control operand of REVERSE_WORDS; pd_16384 (1 << 14) is the rounding term added to each 32-bit product before the arithmetic shift right by 15 | |||
| pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0 | |||
| pd_16384: times 4 dd 16384 | |||
| SECTION .text | |||
| ;----------------------------------------------------------------------------- | |||
| @@ -419,3 +423,130 @@ AC3_EXTRACT_EXPONENTS | |||
| INIT_XMM ssse3 | |||
| AC3_EXTRACT_EXPONENTS | |||
| %endif | |||
| ;----------------------------------------------------------------------------- | |||
| ; void ff_apply_window_int16(int16_t *output, const int16_t *input, | |||
| ; const int16_t *window, unsigned int len) | |||
| ;----------------------------------------------------------------------------- | |||
| ; REVERSE_WORDS reg [, mask] | |||
| ; Reverse the order of the 16-bit words held in %1, in place. | |||
| ;   %1  register whose words are reversed | |||
| ;   %2  register holding the pshufb control mask; only read by the | |||
| ;       ssse3 (non-atom) variant and ignored by the other variants | |||
| %macro REVERSE_WORDS 1-2 | |||
| %if notcpuflag(atom) && cpuflag(ssse3) | |||
|     pshufb  %1, %2 | |||
| %elif cpuflag(sse2) | |||
|     ; swap the two 64-bit halves, then reverse the four words inside each half | |||
|     pshufd  %1, %1, 01001110b | |||
|     pshufhw %1, %1, 00011011b | |||
|     pshuflw %1, %1, 00011011b | |||
| %elif cpuflag(mmxext) | |||
|     ; one shuffle reverses all four words | |||
|     pshufw  %1, %1, 00011011b | |||
| %endif | |||
| %endmacro | |||
| ; MUL16FIXED dst, src, tmp | |||
| ; Multiply the signed words of %1 by the signed words of %2 and keep the | |||
| ; product scaled down by 15 bits; the result replaces %1. | |||
| ;   ssse3           : %1 = ((%1 * %2) + (1<<14)) >> 15, %3 is not used | |||
| ;   mmxext and later: %1 = (%1 * %2) >> 15 without rounding, %3 is clobbered | |||
| ;                     and has to be a register different from %1 and %2 | |||
| %macro MUL16FIXED 3 | |||
| %if cpuflag(ssse3) | |||
|     pmulhrsw %1, %2 | |||
| %elif cpuflag(mmxext) | |||
|     ; pmulhw yields product >> 16, i.e. it loses bit 15 of the product. | |||
|     ; Shift that result left by one and merge bit 15 back in from the low | |||
|     ; half of the product computed by pmullw. | |||
|     mova    %3, %1 | |||
|     pmullw  %3, %2 ; low 16 bits of every product | |||
|     pmulhw  %1, %2 ; high 16 bits of every product | |||
|     psllw   %1, 1 ; make room for the missing bit | |||
|     psrlw   %3, 15 ; isolate bit 15 of the product | |||
|     por     %1, %3 | |||
| %endif | |||
| %endmacro | |||
| ; APPLY_WINDOW_INT16 bitexact | |||
| ; Emit one window-apply function for the instruction set selected by the | |||
| ; preceding INIT_MMX / INIT_XMM. | |||
| ;   %1 = 1: cglobal name apply_window_int16; every product is rounded, | |||
| ;           (input * window + (1<<14)) >> 15 | |||
| ;   %1 = 0: cglobal name apply_window_int16_round; the mmxext/sse2 code uses | |||
| ;           MUL16FIXED, which does not round | |||
| ; Named arguments / registers: | |||
| ;   output, input, window  pointers (args 1-3) | |||
| ;   offset   arg 4 (len); used as the byte offset of the upper block | |||
| ;   offset2  byte offset of the lower block, starts at len - mmsize | |||
| ;   m5       pshufb mask (ssse3 without atom) or the 1<<14 rounding term | |||
| ;            (bit-exact mmxext/sse2); unused otherwise | |||
| ; Each iteration handles one mmsize block at offset2 (walking down) and one | |||
| ; at offset (walking up). The window is only read at offset2; its words are | |||
| ; reversed before being applied to the block at offset. At least one | |||
| ; iteration is always executed. | |||
| %macro APPLY_WINDOW_INT16 1 ; %1 bitexact version | |||
| %if %1 | |||
| cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2 | |||
| %else | |||
| cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2 | |||
| %endif | |||
|     ; len is a 32-bit unsigned argument; on x86-64 the upper half of its | |||
|     ; register is not guaranteed to be zero. Clear it before offsetq is used | |||
|     ; for address computation (this is a plain no-op move on x86-32). | |||
|     mov offsetd, offsetd | |||
|     lea offset2q, [offsetq-mmsize] | |||
| %if cpuflag(ssse3) && notcpuflag(atom) | |||
|     mova m5, [pb_revwords] | |||
|     ALIGN 16 | |||
| %elif %1 && notcpuflag(ssse3) | |||
|     ; only the 32-bit expanding path below reads the rounding constant; the | |||
|     ; ssse3 atom build rounds with pmulhrsw and never reads m5 | |||
|     mova m5, [pd_16384] | |||
| %endif | |||
| .loop: | |||
| %if cpuflag(ssse3) | |||
|     ; This version does the 16x16->16 multiplication in-place without expanding | |||
|     ; to 32-bit. pmulhrsw rounds, so the ssse3 version is bit-identical. | |||
|     mova m0, [windowq+offset2q] | |||
|     mova m1, [ inputq+offset2q] | |||
|     pmulhrsw m1, m0 | |||
|     REVERSE_WORDS m0, m5 | |||
|     pmulhrsw m0, [ inputq+offsetq ] | |||
|     mova [outputq+offset2q], m1 | |||
|     mova [outputq+offsetq ], m0 | |||
| %elif %1 | |||
|     ; This version expands 16-bit to 32-bit, multiplies by the window, | |||
|     ; adds 16384 for rounding, right shifts 15, then repacks back to words to | |||
|     ; save to the output. The window is reversed for the second half. | |||
|     ; | |||
|     ; The window words are interleaved with zeros (zero in the even word of | |||
|     ; each pair) and the input words with whatever m1 held before. pmaddwd | |||
|     ; multiplies those stale words by zero, so each dword ends up holding | |||
|     ; just window * input and m1 needs no initialisation. | |||
|     mova m3, [windowq+offset2q] | |||
|     mova m4, [ inputq+offset2q] | |||
|     pxor m0, m0 | |||
|     punpcklwd m0, m3 | |||
|     punpcklwd m1, m4 | |||
|     pmaddwd m0, m1 | |||
|     paddd m0, m5 | |||
|     psrad m0, 15 | |||
|     pxor m2, m2 | |||
|     punpckhwd m2, m3 | |||
|     punpckhwd m1, m4 | |||
|     pmaddwd m2, m1 | |||
|     paddd m2, m5 | |||
|     psrad m2, 15 | |||
|     packssdw m0, m2 | |||
|     mova [outputq+offset2q], m0 | |||
|     ; same computation for the block at offset, using the reversed window | |||
|     REVERSE_WORDS m3 | |||
|     mova m4, [ inputq+offsetq] | |||
|     pxor m0, m0 | |||
|     punpcklwd m0, m3 | |||
|     punpcklwd m1, m4 | |||
|     pmaddwd m0, m1 | |||
|     paddd m0, m5 | |||
|     psrad m0, 15 | |||
|     pxor m2, m2 | |||
|     punpckhwd m2, m3 | |||
|     punpckhwd m1, m4 | |||
|     pmaddwd m2, m1 | |||
|     paddd m2, m5 | |||
|     psrad m2, 15 | |||
|     packssdw m0, m2 | |||
|     mova [outputq+offsetq], m0 | |||
| %else | |||
|     ; This version does the 16x16->16 multiplication in-place without expanding | |||
|     ; to 32-bit. The mmxext and sse2 versions do not use rounding, and | |||
|     ; therefore are not bit-identical to the C version. | |||
|     mova m0, [windowq+offset2q] | |||
|     mova m1, [ inputq+offset2q] | |||
|     mova m2, [ inputq+offsetq ] | |||
|     MUL16FIXED m1, m0, m3 | |||
|     REVERSE_WORDS m0 | |||
|     MUL16FIXED m2, m0, m3 | |||
|     mova [outputq+offset2q], m1 | |||
|     mova [outputq+offsetq ], m2 | |||
| %endif | |||
|     ; offset moves up and offset2 moves down by one block; the loop ends as | |||
|     ; soon as the subtraction borrows, i.e. offset2 would drop below zero | |||
|     add offsetd, mmsize | |||
|     sub offset2d, mmsize | |||
|     jae .loop | |||
|     REP_RET | |||
| %endmacro | |||
| ; variants built with %1 = 0 (cglobal name apply_window_int16_round) | |||
| INIT_MMX mmxext | |||
| APPLY_WINDOW_INT16 0 | |||
| INIT_XMM sse2 | |||
| APPLY_WINDOW_INT16 0 | |||
| ; variants built with %1 = 1 (cglobal name apply_window_int16) | |||
| INIT_MMX mmxext | |||
| APPLY_WINDOW_INT16 1 | |||
| INIT_XMM sse2 | |||
| APPLY_WINDOW_INT16 1 | |||
| INIT_XMM ssse3 | |||
| APPLY_WINDOW_INT16 1 | |||
| ; ssse3 with the atom flag: REVERSE_WORDS uses the sse2 shuffles instead of pshufb | |||
| INIT_XMM ssse3, atom | |||
| APPLY_WINDOW_INT16 1 | |||
| @@ -29,8 +29,6 @@ pb_zzzzzzzz77777777: times 8 db -1 | |||
| pb_7: times 8 db 7 | |||
| pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 | |||
| pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 | |||
| pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0 | |||
| pd_16384: times 4 dd 16384 | |||
| pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 | |||
| SECTION_TEXT | |||
| @@ -210,134 +208,6 @@ SCALARPRODUCT_LOOP 0 | |||
| RET | |||
| ;----------------------------------------------------------------------------- | |||
| ; void ff_apply_window_int16(int16_t *output, const int16_t *input, | |||
| ; const int16_t *window, unsigned int len) | |||
| ;----------------------------------------------------------------------------- | |||
| ; REVERSE_WORDS reg [, mask] | |||
| ; Reverse the order of the 16-bit words held in %1, in place. | |||
| ;   %1  register whose words are reversed | |||
| ;   %2  register holding the pshufb control mask; only read by the | |||
| ;       ssse3 (non-atom) variant and ignored by the other variants | |||
| %macro REVERSE_WORDS 1-2 | |||
| %if notcpuflag(atom) && cpuflag(ssse3) | |||
|     pshufb  %1, %2 | |||
| %elif cpuflag(sse2) | |||
|     ; swap the two 64-bit halves, then reverse the four words inside each half | |||
|     pshufd  %1, %1, 01001110b | |||
|     pshufhw %1, %1, 00011011b | |||
|     pshuflw %1, %1, 00011011b | |||
| %elif cpuflag(mmxext) | |||
|     ; one shuffle reverses all four words | |||
|     pshufw  %1, %1, 00011011b | |||
| %endif | |||
| %endmacro | |||
| ; MUL16FIXED dst, src, tmp | |||
| ; Multiply the signed words of %1 by the signed words of %2 and keep the | |||
| ; product scaled down by 15 bits; the result replaces %1. | |||
| ;   ssse3           : %1 = ((%1 * %2) + (1<<14)) >> 15, %3 is not used | |||
| ;   mmxext and later: %1 = (%1 * %2) >> 15 without rounding, %3 is clobbered | |||
| ;                     and has to be a register different from %1 and %2 | |||
| %macro MUL16FIXED 3 | |||
| %if cpuflag(ssse3) | |||
|     pmulhrsw %1, %2 | |||
| %elif cpuflag(mmxext) | |||
|     ; pmulhw yields product >> 16, i.e. it loses bit 15 of the product. | |||
|     ; Shift that result left by one and merge bit 15 back in from the low | |||
|     ; half of the product computed by pmullw. | |||
|     mova    %3, %1 | |||
|     pmullw  %3, %2 ; low 16 bits of every product | |||
|     pmulhw  %1, %2 ; high 16 bits of every product | |||
|     psllw   %1, 1 ; make room for the missing bit | |||
|     psrlw   %3, 15 ; isolate bit 15 of the product | |||
|     por     %1, %3 | |||
| %endif | |||
| %endmacro | |||
| ; APPLY_WINDOW_INT16 bitexact | |||
| ; Emit one window-apply function for the instruction set selected by the | |||
| ; preceding INIT_MMX / INIT_XMM. | |||
| ;   %1 = 1: cglobal name apply_window_int16; every product is rounded, | |||
| ;           (input * window + (1<<14)) >> 15 | |||
| ;   %1 = 0: cglobal name apply_window_int16_round; the mmxext/sse2 code uses | |||
| ;           MUL16FIXED, which does not round | |||
| ; Named arguments / registers: | |||
| ;   output, input, window  pointers (args 1-3) | |||
| ;   offset   arg 4 (len); used as the byte offset of the upper block | |||
| ;   offset2  byte offset of the lower block, starts at len - mmsize | |||
| ;   m5       pshufb mask (ssse3 without atom) or the 1<<14 rounding term | |||
| ;            (bit-exact mmxext/sse2); unused otherwise | |||
| ; Each iteration handles one mmsize block at offset2 (walking down) and one | |||
| ; at offset (walking up). The window is only read at offset2; its words are | |||
| ; reversed before being applied to the block at offset. At least one | |||
| ; iteration is always executed. | |||
| %macro APPLY_WINDOW_INT16 1 ; %1 bitexact version | |||
| %if %1 | |||
| cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2 | |||
| %else | |||
| cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2 | |||
| %endif | |||
|     ; len is a 32-bit unsigned argument; on x86-64 the upper half of its | |||
|     ; register is not guaranteed to be zero. Clear it before offsetq is used | |||
|     ; for address computation (this is a plain no-op move on x86-32). | |||
|     mov offsetd, offsetd | |||
|     lea offset2q, [offsetq-mmsize] | |||
| %if cpuflag(ssse3) && notcpuflag(atom) | |||
|     mova m5, [pb_revwords] | |||
|     ALIGN 16 | |||
| %elif %1 && notcpuflag(ssse3) | |||
|     ; only the 32-bit expanding path below reads the rounding constant; the | |||
|     ; ssse3 atom build rounds with pmulhrsw and never reads m5 | |||
|     mova m5, [pd_16384] | |||
| %endif | |||
| .loop: | |||
| %if cpuflag(ssse3) | |||
|     ; This version does the 16x16->16 multiplication in-place without expanding | |||
|     ; to 32-bit. pmulhrsw rounds, so the ssse3 version is bit-identical. | |||
|     mova m0, [windowq+offset2q] | |||
|     mova m1, [ inputq+offset2q] | |||
|     pmulhrsw m1, m0 | |||
|     REVERSE_WORDS m0, m5 | |||
|     pmulhrsw m0, [ inputq+offsetq ] | |||
|     mova [outputq+offset2q], m1 | |||
|     mova [outputq+offsetq ], m0 | |||
| %elif %1 | |||
|     ; This version expands 16-bit to 32-bit, multiplies by the window, | |||
|     ; adds 16384 for rounding, right shifts 15, then repacks back to words to | |||
|     ; save to the output. The window is reversed for the second half. | |||
|     ; | |||
|     ; The window words are interleaved with zeros (zero in the even word of | |||
|     ; each pair) and the input words with whatever m1 held before. pmaddwd | |||
|     ; multiplies those stale words by zero, so each dword ends up holding | |||
|     ; just window * input and m1 needs no initialisation. | |||
|     mova m3, [windowq+offset2q] | |||
|     mova m4, [ inputq+offset2q] | |||
|     pxor m0, m0 | |||
|     punpcklwd m0, m3 | |||
|     punpcklwd m1, m4 | |||
|     pmaddwd m0, m1 | |||
|     paddd m0, m5 | |||
|     psrad m0, 15 | |||
|     pxor m2, m2 | |||
|     punpckhwd m2, m3 | |||
|     punpckhwd m1, m4 | |||
|     pmaddwd m2, m1 | |||
|     paddd m2, m5 | |||
|     psrad m2, 15 | |||
|     packssdw m0, m2 | |||
|     mova [outputq+offset2q], m0 | |||
|     ; same computation for the block at offset, using the reversed window | |||
|     REVERSE_WORDS m3 | |||
|     mova m4, [ inputq+offsetq] | |||
|     pxor m0, m0 | |||
|     punpcklwd m0, m3 | |||
|     punpcklwd m1, m4 | |||
|     pmaddwd m0, m1 | |||
|     paddd m0, m5 | |||
|     psrad m0, 15 | |||
|     pxor m2, m2 | |||
|     punpckhwd m2, m3 | |||
|     punpckhwd m1, m4 | |||
|     pmaddwd m2, m1 | |||
|     paddd m2, m5 | |||
|     psrad m2, 15 | |||
|     packssdw m0, m2 | |||
|     mova [outputq+offsetq], m0 | |||
| %else | |||
|     ; This version does the 16x16->16 multiplication in-place without expanding | |||
|     ; to 32-bit. The mmxext and sse2 versions do not use rounding, and | |||
|     ; therefore are not bit-identical to the C version. | |||
|     mova m0, [windowq+offset2q] | |||
|     mova m1, [ inputq+offset2q] | |||
|     mova m2, [ inputq+offsetq ] | |||
|     MUL16FIXED m1, m0, m3 | |||
|     REVERSE_WORDS m0 | |||
|     MUL16FIXED m2, m0, m3 | |||
|     mova [outputq+offset2q], m1 | |||
|     mova [outputq+offsetq ], m2 | |||
| %endif | |||
|     ; offset moves up and offset2 moves down by one block; the loop ends as | |||
|     ; soon as the subtraction borrows, i.e. offset2 would drop below zero | |||
|     add offsetd, mmsize | |||
|     sub offset2d, mmsize | |||
|     jae .loop | |||
|     REP_RET | |||
| %endmacro | |||
| ; variants built with %1 = 0 (cglobal name apply_window_int16_round) | |||
| INIT_MMX mmxext | |||
| APPLY_WINDOW_INT16 0 | |||
| INIT_XMM sse2 | |||
| APPLY_WINDOW_INT16 0 | |||
| ; variants built with %1 = 1 (cglobal name apply_window_int16) | |||
| INIT_MMX mmxext | |||
| APPLY_WINDOW_INT16 1 | |||
| INIT_XMM sse2 | |||
| APPLY_WINDOW_INT16 1 | |||
| INIT_XMM ssse3 | |||
| APPLY_WINDOW_INT16 1 | |||
| ; ssse3 with the atom flag: REVERSE_WORDS uses the sse2 shuffles instead of pshufb | |||
| INIT_XMM ssse3, atom | |||
| APPLY_WINDOW_INT16 1 | |||
| ; void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, | |||
| ; const uint8_t *diff, int w, | |||
| ; int *left, int *left_top) | |||