Merge remote-tracking branch 'qatar/master'

* qatar/master:
  sws: implement MMX/SSE2/SSSE3/SSE4 versions for horizontal scaling.
  include stdint.h in adpcm_data.h
  mpeg12: reorder functions to avoid ugly forward declarations
  Fixed off by one packet size allocation in the smacker demuxer.
  Check for invalid packet size in the smacker demuxer.
  ape demuxer: fix segfault on memory allocation failure.
  xan: Add some buffer checks
  xan: Remove extra trailing newline
  Fixed size given to init_get_bits() in xan decoder.

Conflicts:
	libavcodec/mpeg12.c
	libswscale/x86/swscale_template.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>
--- a/libavcodec/adpcm_data.h
+++ b/libavcodec/adpcm_data.h
@@ -26,6 +26,8 @@
 #ifndef AVCODEC_ADPCM_DATA_H
 #define AVCODEC_ADPCM_DATA_H
 
+#include <stdint.h>
+
 extern const int8_t  ff_adpcm_index_table[16];
 extern const int16_t ff_adpcm_step_table[89];
 extern const int16_t ff_adpcm_AdaptationTable[];
--- a/libavcodec/xan.c
+++ b/libavcodec/xan.c
@@ -97,17 +97,21 @@ static av_cold int xan_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static int xan_huffman_decode(unsigned char *dest, const unsigned char *src,
-                              int dest_len)
+static int xan_huffman_decode(unsigned char *dest, int dest_len,
+                              const unsigned char *src, int src_len)
 {
     unsigned char byte = *src++;
     unsigned char ival = byte + 0x16;
     const unsigned char * ptr = src + byte*2;
+    int ptr_len = src_len - 1 - byte*2;
     unsigned char val = ival;
     unsigned char *dest_end = dest + dest_len;
     GetBitContext gb;
 
-    init_get_bits(&gb, ptr, 0); // FIXME: no src size available
+    if (ptr_len < 0)
+        return AVERROR_INVALIDDATA;
+
+    init_get_bits(&gb, ptr, ptr_len * 8);
 
     while ( val != 0x16 ) {
         val = src[val - 0x17 + get_bits1(&gb) * byte];
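(The arithmetic behind ptr_len, for reference: src[0] holds the node count
"byte" and the byte*2 table bytes follow it, so only the remainder of the
packet feeds the bitstream. A sketch of the layout as the code reads it:

    /* src[0]              node count ("byte")
     * src[1 .. byte*2]    Huffman node table
     * src[1 + byte*2 ..]  bitstream, src_len - 1 - byte*2 bytes long;
     *                     this goes negative for a truncated packet,
     *                     which is what the AVERROR_INVALIDDATA check
     *                     above rejects */
)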
@@ -246,7 +250,7 @@ static inline void xan_wc3_copy_pixel_run(XanContext *s,
     }
 }
 
-static void xan_wc3_decode_frame(XanContext *s) {
+static int xan_wc3_decode_frame(XanContext *s) {
 
     int width  = s->avctx->width;
     int height = s->avctx->height;
@@ -266,13 +270,30 @@ static void xan_wc3_decode_frame(XanContext *s) {
     const unsigned char *size_segment;
     const unsigned char *vector_segment;
     const unsigned char *imagedata_segment;
+    int huffman_offset, size_offset, vector_offset, imagedata_offset;
+
+    if (s->size < 8)
+        return AVERROR_INVALIDDATA;
+
+    huffman_offset   = AV_RL16(&s->buf[0]);
+    size_offset      = AV_RL16(&s->buf[2]);
+    vector_offset    = AV_RL16(&s->buf[4]);
+    imagedata_offset = AV_RL16(&s->buf[6]);
 
-    huffman_segment = s->buf + AV_RL16(&s->buf[0]);
-    size_segment    = s->buf + AV_RL16(&s->buf[2]);
-    vector_segment  = s->buf + AV_RL16(&s->buf[4]);
-    imagedata_segment = s->buf + AV_RL16(&s->buf[6]);
+    if (huffman_offset   >= s->size ||
+        size_offset      >= s->size ||
+        vector_offset    >= s->size ||
+        imagedata_offset >= s->size)
+        return AVERROR_INVALIDDATA;
 
-    xan_huffman_decode(opcode_buffer, huffman_segment, opcode_buffer_size);
+    huffman_segment   = s->buf + huffman_offset;
+    size_segment      = s->buf + size_offset;
+    vector_segment    = s->buf + vector_offset;
+    imagedata_segment = s->buf + imagedata_offset;
+
+    if (xan_huffman_decode(opcode_buffer, opcode_buffer_size,
+                           huffman_segment, s->size - huffman_offset) < 0)
+        return AVERROR_INVALIDDATA;
 
     if (imagedata_segment[0] == 2)
         xan_unpack(s->buffer2, &imagedata_segment[1], s->buffer2_size);
@@ -358,6 +379,7 @@ static void xan_wc3_decode_frame(XanContext *s) {
         y += (x + size) / width;
         x  = (x + size) % width;
     }
+    return 0;
 }
 
 #if RUNTIME_GAMMA
@@ -519,7 +541,8 @@ static int xan_decode_frame(AVCodecContext *avctx,
     s->buf = buf;
     s->size = buf_size;
 
-    xan_wc3_decode_frame(s);
+    if (xan_wc3_decode_frame(s) < 0)
+        return AVERROR_INVALIDDATA;
 
     /* release the last frame if it is allocated */
     if (s->last_frame.data[0])
@@ -563,4 +586,3 @@ AVCodec ff_xan_wc3_decoder = {
     .capabilities   = CODEC_CAP_DR1,
     .long_name      = NULL_IF_CONFIG_SMALL("Wing Commander III / Xan"),
 };
-
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -19,6 +19,7 @@ OBJS-$(HAVE_MMX) += x86/rgb2rgb.o \
                                 x86/swscale_mmx.o      \
                                 x86/yuv2rgb_mmx.o
 OBJS-$(HAVE_VIS)             += sparc/yuv2rgb_vis.o
+OBJS-$(HAVE_YASM)            += x86/scale.o
 
 TESTPROGS = colorspace swscale
 
--- /dev/null
+++ b/libswscale/x86/scale.asm
@@ -0,0 +1,429 @@
;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

max_19bit_int: times 4 dd 0x7ffff
max_19bit_flt: times 4 dd 524287.0
minshort:      times 8 dw 0x8000
unicoeff:      times 4 dd 0x20000000

SECTION .text
;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
;                               (SwsContext *c, int{16,32}_t *dst,
;                                int dstW, const uint{8,16}_t *src,
;                                const int16_t *filter,
;                                const int16_t *filterPos, int filterSize);
;
; Scale one horizontal line. The input is either 8 or 16 bits wide
; ($source_width can be 8, 9, 10 or 16; the difference is whether we have
; to downscale before multiplying). The filter is 14 bits. The output is
; either 15 bits (in int16_t) or 19 bits (in int32_t), as given by
; $intermediate_nbits. Each output pixel is generated from $filterSize input
; pixels; the position of the first pixel is given in filterPos[nOutputPixel].
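;
; As a scalar sketch of what each variant computes (shown for the 8-bit
; input, 15-bit output case; the generic C fallback follows the same
; pattern):
;
;   for (i = 0; i < dstW; i++) {
;       int j, val = 0;
;       for (j = 0; j < filterSize; j++)
;           val += src[filterPos[i] + j] * filter[filterSize * i + j];
;       dst[i] = FFMIN(val >> 7, (1 << 15) - 1); // (8+14)-bit sum -> 15 bits
;   }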
;-----------------------------------------------------------------------------
; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, opt, n_args, n_xmm
%macro SCALE_FUNC 7
cglobal hscale%1to%2_%4_%5, %6, 7, %7
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
    mova          m2, [max_19bit_int]
%elifidn %5, sse4
    mova          m2, [max_19bit_int]
%else ; ssse3/sse2
    mova          m2, [max_19bit_flt]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
    mova          m6, [minshort]
    mova          m7, [unicoeff]
%elif %1 == 8
    pxor          m3, m3
%endif ; %1 == 8/16

%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16

%ifnidn %3, X

    ; setup loop
%if %3 == 8
    shl           r2, 1                 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define r2shr 1
%else ; %3 == 4
%define r2shr 0
%endif ; %3 == 8
    lea           r4, [r4+r2*8]
%if %2 == 15
    lea           r1, [r1+r2*(2>>r2shr)]
%else ; %2 == 19
    lea           r1, [r1+r2*(4>>r2shr)]
%endif ; %2 == 15/19
    lea           r5, [r5+r2*(2>>r2shr)]
    neg           r2

.loop:
%if %3 == 4 ; filterSize == 4 scaling
    ; load 2x4 or 4x4 source pixels into m0/m1
    movsx         r0, word [r5+r2*2+0]  ; filterPos[0]
    movsx         r6, word [r5+r2*2+2]  ; filterPos[1]
    movlh         m0, [r3+r0*srcmul]    ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
    movlh         m1, [r3+r6*srcmul]    ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
    movhps        m0, [r3+r6*srcmul]    ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
    movd          m4, [r3+r6*srcmul]    ; src[filterPos[1] + {0,1,2,3}]
%endif
    movsx         r0, word [r5+r2*2+4]  ; filterPos[2]
    movsx         r6, word [r5+r2*2+6]  ; filterPos[3]
    movlh         m1, [r3+r0*srcmul]    ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
    movhps        m1, [r3+r6*srcmul]    ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
    movd          m5, [r3+r6*srcmul]    ; src[filterPos[3] + {0,1,2,3}]
    punpckldq     m0, m4
    punpckldq     m1, m5
%endif ; %1 == 8 && %5 <= ssse3
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                ; byte -> word
    punpcklbw     m1, m3                ; byte -> word
%endif ; %1 == 8

    ; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
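             ; (the identity used: sum((x - 0x8000) * coef) + 0x8000 * sum(coef)
             ;  == sum(x * coef); the coefficients are normalized to sum to
             ;  1 << 14, so the bias is 0x8000 << 14 == 0x20000000, i.e. the
             ;  unicoeff constant added back below)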
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4+r2*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [r4+r2*8+mmsize*1] ; *= filter[{8,9,..,14,15}]

    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if mmsize == 8 ; mmx
    movq          m4, m0
    punpckldq     m0, m1
    punpckhdq     m4, m1
    paddd         m0, m4
%elifidn %5, sse2
    mova          m4, m0
    shufps        m0, m1, 10001000b
    shufps        m4, m1, 11011101b
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m0, m1                ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
                                        ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
                                        ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
                                        ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
    movsx         r0, word [r5+r2*1+0]  ; filterPos[0]
    movsx         r6, word [r5+r2*1+2]  ; filterPos[1]
    movbh         m0, [r3+ r0   *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
    movbh         m1, [r3+(r0+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
    movbh         m4, [r3+ r6   *srcmul] ; src[filterPos[1] + {0,1,2,3}]
    movbh         m5, [r3+(r6+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
    movbh         m1, [r3+ r6   *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
    movsx         r0, word [r5+r2*1+4]  ; filterPos[2]
    movsx         r6, word [r5+r2*1+6]  ; filterPos[3]
    movbh         m4, [r3+ r0   *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
    movbh         m5, [r3+ r6   *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                ; byte -> word
    punpcklbw     m1, m3                ; byte -> word
    punpcklbw     m4, m3                ; byte -> word
    punpcklbw     m5, m3                ; byte -> word
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
    psubw         m4, m6
    psubw         m5, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4+r2*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [r4+r2*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
    pmaddwd       m4, [r4+r2*8+mmsize*2] ; *= filter[{16,17,..,22,23}]
    pmaddwd       m5, [r4+r2*8+mmsize*3] ; *= filter[{24,25,..,30,31}]

    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if mmsize == 8
    paddd         m0, m1
    paddd         m4, m5
    movq          m1, m0
    punpckldq     m0, m4
    punpckhdq     m1, m4
    paddd         m0, m1
%elifidn %5, sse2
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
    ; emulate horizontal add as transpose + vertical add
    mova         mex, m0
    punpckldq     m0, m1
    punpckhdq    mex, m1
    paddd         m0, mex
    mova          m1, m4
    punpckldq     m4, m5
    punpckhdq     m1, m5
    paddd         m4, m1
    mova          m1, m0
    punpcklqdq    m0, m4
    punpckhqdq    m1, m4
    paddd         m0, m1
%else ; ssse3/sse4
    ; FIXME if we rearrange the filter in pairs of 4, we can
    ; load pixels likewise and use 2 x paddd + phaddd instead
    ; of 3 x phaddd here, faster on older cpus
    phaddd        m0, m1
    phaddd        m4, m5
    phaddd        m0, m4                ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
                                        ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
                                        ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
                                        ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %3 == 4/8
%else ; %3 == X, i.e. any filterSize scaling
%ifidn %4, X4
%define r6sub 4
%else ; %4 == X || %4 == X8
%define r6sub 0
%endif ; %4 ==/!= X4
%ifdef ARCH_X86_64
    push         r12
    movsxd        r6, r6d               ; filterSize
    lea          r12, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4]
%define src_reg r11
%define r1x     r10
%define filter2 r12
%else ; x86-32
    lea           r0, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4]
    mov          r6m, r0
%define src_reg r3
%define r1x     r1
%define filter2 r6m
%endif ; x86-32/64
    lea           r5, [r5+r2*2]
%if %2 == 15
    lea           r1, [r1+r2*2]
%else ; %2 == 19
    lea           r1, [r1+r2*4]
%endif ; %2 == 15/19
    movifnidn   r1mp, r1
    neg           r2
.loop:
    movsx         r0, word [r5+r2*2+0]  ; filterPos[0]
    movsx        r1x, word [r5+r2*2+2]  ; filterPos[1]
    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
    pxor          m4, m4
    pxor          m5, m5
    mov      src_reg, r3mp

.innerloop:
    ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
    movbh         m0, [src_reg+r0         *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
    movbh         m1, [src_reg+(r1x+r6sub)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
    punpcklbw     m0, m3
    punpcklbw     m1, m3
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4]              ; filter[{0,1,2,3(,4,5,6,7)}]
    pmaddwd       m1, [r4+(r6+r6sub)*2] ; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
    paddd         m4, m0
    paddd         m5, m1
    add           r4, mmsize
    add      src_reg, srcmul*mmsize/2
    cmp      src_reg, filter2           ; while (src += 4) < &src[filterSize]
    jl .innerloop
%ifidn %4, X4
    movsx        r1x, word [r5+r2*2+2]  ; filterPos[1]
    movlh         m0, [src_reg+r0         *srcmul] ; split last 4 srcpx of dstpx[0]
    sub          r1x, r6                            ; and first 4 srcpx of dstpx[1]
%if %1 > 8
    movhps        m0, [src_reg+(r1x+r6sub)*srcmul]
%else ; %1 == 8
    movd          m1, [src_reg+(r1x+r6sub)*srcmul]
    punpckldq     m0, m1
%endif ; %1 == 8 && %5 <= ssse3
%if %1 == 8
    punpcklbw     m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4]
%endif ; %4 == X4
    lea           r4, [r4+(r6+r6sub)*2]

%if mmsize == 8 ; mmx
    movq          m0, m4
    punpckldq     m4, m5
    punpckhdq     m0, m5
    paddd         m0, m4
%else ; mmsize == 16
%ifidn %5, sse2
    mova          m1, m4
    punpcklqdq    m4, m5
    punpckhqdq    m1, m5
    paddd         m4, m1
%else ; ssse3/sse4
    phaddd        m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
    paddd         m4, m0
%endif ; %4 == X4
%ifidn %5, sse2
    pshufd        m4, m4, 11011000b
    movhlps       m0, m4
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m4, m4
    SWAP           0, 4
%endif ; sse2/ssse3/sse4
%endif ; mmsize == 8/16
%endif ; %3 ==/!= X
%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
    paddd         m0, m7
%endif ; %1 == 16

    ; clip, store
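    ; (a %1-bit sample times a 14-bit coefficient accumulates to 14 + %1
    ;  significant bits, so shifting right by 14 + %1 - %2 leaves the
    ;  %2-bit intermediate described in the header comment)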
    psrad         m0, 14 + %1 - %2
%ifidn %3, X
    movifnidn     r1, r1mp
%endif ; %3 == X
%if %2 == 15
    packssdw      m0, m0
%ifnidn %3, X
    movh [r1+r2*(2>>r2shr)], m0
%else ; %3 == X
    movd   [r1+r2*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
%if mmsize == 8
    PMINSD_MMX    m0, m2, m4
%elifidn %5, sse4
    pminsd        m0, m2
%else ; sse2/ssse3
    cvtdq2ps      m0, m0
    minps         m0, m2
    cvtps2dq      m0, m0
%endif ; mmx/sse2/ssse3/sse4
%ifnidn %3, X
    movu [r1+r2*(4>>r2shr)], m0
%else ; %3 == X
    movq   [r1+r2*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
    add           r2, (mmsize<<r2shr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
                                        ; per iteration. see "shl r2,1" above as for why we do this
%else ; %3 == X
    add           r2, 2
%endif ; %3 ==/!= X
    jl .loop
%ifnidn %3, X
    REP_RET
%else ; %3 == X
%ifdef ARCH_X86_64
    pop          r12
    RET
%else ; x86-32
    REP_RET
%endif ; x86-32/64
%endif ; %3 ==/!= X
%endmacro
; SCALE_FUNCS source_width, intermediate_nbits, opt, n_xmm
%macro SCALE_FUNCS 4
SCALE_FUNC %1, %2, 4, 4,  %3, 6, %4
SCALE_FUNC %1, %2, 8, 8,  %3, 6, %4
%if mmsize == 8
SCALE_FUNC %1, %2, X, X,  %3, 7, %4
%else
SCALE_FUNC %1, %2, X, X4, %3, 7, %4
SCALE_FUNC %1, %2, X, X8, %3, 7, %4
%endif
%endmacro

; SCALE_FUNCS2 opt, 8_xmm_args, 9to10_xmm_args, 16_xmm_args
%macro SCALE_FUNCS2 4
%ifnidn %1, sse4
SCALE_FUNCS  8, 15, %1, %2
SCALE_FUNCS  9, 15, %1, %3
SCALE_FUNCS 10, 15, %1, %3
SCALE_FUNCS 16, 15, %1, %4
%endif ; !sse4
SCALE_FUNCS  8, 19, %1, %2
SCALE_FUNCS  9, 19, %1, %3
SCALE_FUNCS 10, 19, %1, %3
SCALE_FUNCS 16, 19, %1, %4
%endmacro

%ifdef ARCH_X86_32
INIT_MMX
SCALE_FUNCS2 mmx,   0, 0, 0
%endif
INIT_XMM
SCALE_FUNCS2 sse2,  6, 7, 8
SCALE_FUNCS2 ssse3, 6, 6, 8
SCALE_FUNCS2 sse4,  6, 6, 8
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -176,6 +176,41 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
     }
 }
 
+#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
+extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
+                                                SwsContext *c, int16_t *data, \
+                                                int dstW, const uint8_t *src, \
+                                                const int16_t *filter, \
+                                                const int16_t *filterPos, int filterSize);
+#define SCALE_FUNCS(filter_n, opt) \
+    SCALE_FUNC(filter_n,  8, 15, opt); \
+    SCALE_FUNC(filter_n,  9, 15, opt); \
+    SCALE_FUNC(filter_n, 10, 15, opt); \
+    SCALE_FUNC(filter_n, 16, 15, opt); \
+    SCALE_FUNC(filter_n,  8, 19, opt); \
+    SCALE_FUNC(filter_n,  9, 19, opt); \
+    SCALE_FUNC(filter_n, 10, 19, opt); \
+    SCALE_FUNC(filter_n, 16, 19, opt)
+
+#define SCALE_FUNCS_MMX(opt) \
+    SCALE_FUNCS(4, opt); \
+    SCALE_FUNCS(8, opt); \
+    SCALE_FUNCS(X, opt)
+
+#define SCALE_FUNCS_SSE(opt) \
+    SCALE_FUNCS(4, opt); \
+    SCALE_FUNCS(8, opt); \
+    SCALE_FUNCS(X4, opt); \
+    SCALE_FUNCS(X8, opt)
+
+#if ARCH_X86_32
+SCALE_FUNCS_MMX(mmx);
+#endif
+SCALE_FUNCS_SSE(sse2);
+SCALE_FUNCS_SSE(ssse3);
+SCALE_FUNCS_SSE(sse4);
+
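(As a concrete instance of the token pasting above: SCALE_FUNC(4, 8, 15, sse2)
declares

    extern void ff_hscale8to15_4_sse2(SwsContext *c, int16_t *data, int dstW,
                                      const uint8_t *src, const int16_t *filter,
                                      const int16_t *filterPos, int filterSize);

which resolves to the hscale8to15_4_sse2 entry point that
SCALE_FUNC 8, 15, 4, 4, sse2, 6, 6 emits in scale.asm, with cglobal supplying
the ff_ prefix via x86inc's name mangling.)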
 void ff_sws_init_swScale_mmx(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -186,4 +221,55 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
     if (cpu_flags & AV_CPU_FLAG_MMX2)
         sws_init_swScale_MMX2(c);
 #endif
+
+#if HAVE_YASM
+#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
+    if (c->srcBpc == 8) { \
+        hscalefn = c->dstBpc <= 10 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale8to19_ ## filtersize ## _ ## opt1; \
+    } else if (c->srcBpc == 9) { \
+        hscalefn = c->dstBpc <= 10 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale9to19_ ## filtersize ## _ ## opt1; \
+    } else if (c->srcBpc == 10) { \
+        hscalefn = c->dstBpc <= 10 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale10to19_ ## filtersize ## _ ## opt1; \
+    } else if (c->srcBpc == 16 && !((c->srcFormat == PIX_FMT_PAL8 || isAnyRGB(c->srcFormat)) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1 < 15)) { \
+        hscalefn = c->dstBpc <= 10 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale16to19_ ## filtersize ## _ ## opt1; \
+    } \
+} while (0)
+#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
+    switch (filtersize) { \
+    case 4:  ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
+    case 8:  ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
+    default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
+    }
+#if ARCH_X86_32
+    if (cpu_flags & AV_CPU_FLAG_MMX) {
+        ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
+        ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
+    }
+#endif
+#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
+    switch (filtersize) { \
+    case 4:  ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
+    case 8:  ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
+    default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); \
+             else                ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); \
+             break; \
+    }
+    if (cpu_flags & AV_CPU_FLAG_SSE2) {
+        ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
+        ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
+    }
+    if (cpu_flags & AV_CPU_FLAG_SSSE3) {
+        ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
+        ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3);
+    }
+    if (cpu_flags & AV_CPU_FLAG_SSE4) {
+        /* Xto15 don't need special sse4 functions */
+        ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
+        ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
+    }
+#endif
 }
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -1951,164 +1951,6 @@ static void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV,
     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
 }
 
-#if !COMPILE_TEMPLATE_MMX2
-// bilinear / bicubic scaling
-static void RENAME(hScale)(SwsContext *c, int16_t *dst, int dstW,
-                           const uint8_t *src, const int16_t *filter,
-                           const int16_t *filterPos, int filterSize)
-{
-    assert(filterSize % 4 == 0 && filterSize>0);
-    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
-        x86_reg counter= -2*dstW;
-        filter-= counter*2;
-        filterPos-= counter/2;
-        dst-= counter/2;
-        __asm__ volatile(
-#if defined(PIC)
-            "push %%"REG_b" \n\t"
-#endif
-            "pxor %%mm7, %%mm7 \n\t"
-            "push %%"REG_BP" \n\t" // we use 7 regs here ...
-            "mov %%"REG_a", %%"REG_BP" \n\t"
-            ".p2align 4 \n\t"
-            "1: \n\t"
-            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
-            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
-            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
-            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
-            "movd (%3, %%"REG_a"), %%mm0 \n\t"
-            "movd (%3, %%"REG_b"), %%mm2 \n\t"
-            "punpcklbw %%mm7, %%mm0 \n\t"
-            "punpcklbw %%mm7, %%mm2 \n\t"
-            "pmaddwd %%mm1, %%mm0 \n\t"
-            "pmaddwd %%mm2, %%mm3 \n\t"
-            "movq %%mm0, %%mm4 \n\t"
-            "punpckldq %%mm3, %%mm0 \n\t"
-            "punpckhdq %%mm3, %%mm4 \n\t"
-            "paddd %%mm4, %%mm0 \n\t"
-            "psrad $7, %%mm0 \n\t"
-            "packssdw %%mm0, %%mm0 \n\t"
-            "movd %%mm0, (%4, %%"REG_BP") \n\t"
-            "add $4, %%"REG_BP" \n\t"
-            " jnc 1b \n\t"
-            "pop %%"REG_BP" \n\t"
-#if defined(PIC)
-            "pop %%"REG_b" \n\t"
-#endif
-            : "+a" (counter)
-            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
-#if !defined(PIC)
-            : "%"REG_b
-#endif
-        );
-    } else if (filterSize==8) {
-        x86_reg counter= -2*dstW;
-        filter-= counter*4;
-        filterPos-= counter/2;
-        dst-= counter/2;
-        __asm__ volatile(
-#if defined(PIC)
-            "push %%"REG_b" \n\t"
-#endif
-            "pxor %%mm7, %%mm7 \n\t"
-            "push %%"REG_BP" \n\t" // we use 7 regs here ...
-            "mov %%"REG_a", %%"REG_BP" \n\t"
-            ".p2align 4 \n\t"
-            "1: \n\t"
-            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
-            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
-            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
-            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
-            "movd (%3, %%"REG_a"), %%mm0 \n\t"
-            "movd (%3, %%"REG_b"), %%mm2 \n\t"
-            "punpcklbw %%mm7, %%mm0 \n\t"
-            "punpcklbw %%mm7, %%mm2 \n\t"
-            "pmaddwd %%mm1, %%mm0 \n\t"
-            "pmaddwd %%mm2, %%mm3 \n\t"
-            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
-            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
-            "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
-            "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
-            "punpcklbw %%mm7, %%mm4 \n\t"
-            "punpcklbw %%mm7, %%mm2 \n\t"
-            "pmaddwd %%mm1, %%mm4 \n\t"
-            "pmaddwd %%mm2, %%mm5 \n\t"
-            "paddd %%mm4, %%mm0 \n\t"
-            "paddd %%mm5, %%mm3 \n\t"
-            "movq %%mm0, %%mm4 \n\t"
-            "punpckldq %%mm3, %%mm0 \n\t"
-            "punpckhdq %%mm3, %%mm4 \n\t"
-            "paddd %%mm4, %%mm0 \n\t"
-            "psrad $7, %%mm0 \n\t"
-            "packssdw %%mm0, %%mm0 \n\t"
-            "movd %%mm0, (%4, %%"REG_BP") \n\t"
-            "add $4, %%"REG_BP" \n\t"
-            " jnc 1b \n\t"
-            "pop %%"REG_BP" \n\t"
-#if defined(PIC)
-            "pop %%"REG_b" \n\t"
-#endif
-            : "+a" (counter)
-            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
-#if !defined(PIC)
-            : "%"REG_b
-#endif
-        );
-    } else {
-        const uint8_t *offset = src+filterSize;
-        x86_reg counter= -2*dstW;
-        //filter-= counter*filterSize/2;
-        filterPos-= counter/2;
-        dst-= counter/2;
-        __asm__ volatile(
-            "pxor %%mm7, %%mm7 \n\t"
-            ".p2align 4 \n\t"
-            "1: \n\t"
-            "mov %2, %%"REG_c" \n\t"
-            "movzwl (%%"REG_c", %0), %%eax \n\t"
-            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
-            "mov %5, %%"REG_c" \n\t"
-            "pxor %%mm4, %%mm4 \n\t"
-            "pxor %%mm5, %%mm5 \n\t"
-            "2: \n\t"
-            "movq (%1), %%mm1 \n\t"
-            "movq (%1, %6), %%mm3 \n\t"
-            "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
-            "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
-            "punpcklbw %%mm7, %%mm0 \n\t"
-            "punpcklbw %%mm7, %%mm2 \n\t"
-            "pmaddwd %%mm1, %%mm0 \n\t"
-            "pmaddwd %%mm2, %%mm3 \n\t"
-            "paddd %%mm3, %%mm5 \n\t"
-            "paddd %%mm0, %%mm4 \n\t"
-            "add $8, %1 \n\t"
-            "add $4, %%"REG_c" \n\t"
-            "cmp %4, %%"REG_c" \n\t"
-            " jb 2b \n\t"
-            "add %6, %1 \n\t"
-            "movq %%mm4, %%mm0 \n\t"
-            "punpckldq %%mm5, %%mm4 \n\t"
-            "punpckhdq %%mm5, %%mm0 \n\t"
-            "paddd %%mm0, %%mm4 \n\t"
-            "psrad $7, %%mm4 \n\t"
-            "packssdw %%mm4, %%mm4 \n\t"
-            "mov %3, %%"REG_a" \n\t"
-            "movd %%mm4, (%%"REG_a", %0) \n\t"
-            "add $4, %0 \n\t"
-            " jnc 1b \n\t"
-            : "+r" (counter), "+r" (filter)
-            : "m" (filterPos), "m" (dst), "m"(offset),
-              "m" (src), "r" ((x86_reg)filterSize*2)
-            : "%"REG_a, "%"REG_c, "%"REG_d
-        );
-    }
-}
-#endif /* !COMPILE_TEMPLATE_MMX2 */
 
 static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
                                     const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
 {
@@ -2265,7 +2107,6 @@ static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src,
     }
 }
 
-
 #if COMPILE_TEMPLATE_MMX2
 static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                  int dstWidth, const uint8_t *src,
@@ -2466,10 +2307,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
     }
 
     if (c->srcBpc == 8 && c->dstBpc <= 10) {
-#if !COMPILE_TEMPLATE_MMX2
-        c->hyScale = c->hcScale = RENAME(hScale);
-#endif /* !COMPILE_TEMPLATE_MMX2 */
-        // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
 #if COMPILE_TEMPLATE_MMX2
         if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)