* qatar/master: sws: implement MMX/SSE2/SSSE3/SSE4 versions for horizontal scaling. include stdint.h in adpcm_data.h mpeg12: reorder functions to avoid ugly forward declarations Fixed off by one packet size allocation in the smacker demuxer. Check for invalid packet size in the smacker demuxer. ape demuxer: fix segfault on memory allocation failure. xan: Add some buffer checks xan: Remove extra trailing newline Fixed size given to init_get_bits() in xan decoder. Conflicts: libavcodec/mpeg12.c libswscale/x86/swscale_template.c Merged-by: Michael Niedermayer <michaelni@gmx.at>tags/n0.9
| @@ -26,6 +26,8 @@ | |||||
| #ifndef AVCODEC_ADPCM_DATA_H | #ifndef AVCODEC_ADPCM_DATA_H | ||||
| #define AVCODEC_ADPCM_DATA_H | #define AVCODEC_ADPCM_DATA_H | ||||
| #include <stdint.h> | |||||
| extern const int8_t ff_adpcm_index_table[16]; | extern const int8_t ff_adpcm_index_table[16]; | ||||
| extern const int16_t ff_adpcm_step_table[89]; | extern const int16_t ff_adpcm_step_table[89]; | ||||
| extern const int16_t ff_adpcm_AdaptationTable[]; | extern const int16_t ff_adpcm_AdaptationTable[]; | ||||
| @@ -97,17 +97,21 @@ static av_cold int xan_decode_init(AVCodecContext *avctx) | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| static int xan_huffman_decode(unsigned char *dest, const unsigned char *src, | |||||
| int dest_len) | |||||
| static int xan_huffman_decode(unsigned char *dest, int dest_len, | |||||
| const unsigned char *src, int src_len) | |||||
| { | { | ||||
| unsigned char byte = *src++; | unsigned char byte = *src++; | ||||
| unsigned char ival = byte + 0x16; | unsigned char ival = byte + 0x16; | ||||
| const unsigned char * ptr = src + byte*2; | const unsigned char * ptr = src + byte*2; | ||||
| int ptr_len = src_len - 1 - byte*2; | |||||
| unsigned char val = ival; | unsigned char val = ival; | ||||
| unsigned char *dest_end = dest + dest_len; | unsigned char *dest_end = dest + dest_len; | ||||
| GetBitContext gb; | GetBitContext gb; | ||||
| init_get_bits(&gb, ptr, 0); // FIXME: no src size available | |||||
| if (ptr_len < 0) | |||||
| return AVERROR_INVALIDDATA; | |||||
| init_get_bits(&gb, ptr, ptr_len * 8); | |||||
| while ( val != 0x16 ) { | while ( val != 0x16 ) { | ||||
| val = src[val - 0x17 + get_bits1(&gb) * byte]; | val = src[val - 0x17 + get_bits1(&gb) * byte]; | ||||
| @@ -246,7 +250,7 @@ static inline void xan_wc3_copy_pixel_run(XanContext *s, | |||||
| } | } | ||||
| } | } | ||||
| static void xan_wc3_decode_frame(XanContext *s) { | |||||
| static int xan_wc3_decode_frame(XanContext *s) { | |||||
| int width = s->avctx->width; | int width = s->avctx->width; | ||||
| int height = s->avctx->height; | int height = s->avctx->height; | ||||
| @@ -266,13 +270,30 @@ static void xan_wc3_decode_frame(XanContext *s) { | |||||
| const unsigned char *size_segment; | const unsigned char *size_segment; | ||||
| const unsigned char *vector_segment; | const unsigned char *vector_segment; | ||||
| const unsigned char *imagedata_segment; | const unsigned char *imagedata_segment; | ||||
| int huffman_offset, size_offset, vector_offset, imagedata_offset; | |||||
| if (s->size < 8) | |||||
| return AVERROR_INVALIDDATA; | |||||
| huffman_offset = AV_RL16(&s->buf[0]); | |||||
| size_offset = AV_RL16(&s->buf[2]); | |||||
| vector_offset = AV_RL16(&s->buf[4]); | |||||
| imagedata_offset = AV_RL16(&s->buf[6]); | |||||
| huffman_segment = s->buf + AV_RL16(&s->buf[0]); | |||||
| size_segment = s->buf + AV_RL16(&s->buf[2]); | |||||
| vector_segment = s->buf + AV_RL16(&s->buf[4]); | |||||
| imagedata_segment = s->buf + AV_RL16(&s->buf[6]); | |||||
| if (huffman_offset >= s->size || | |||||
| size_offset >= s->size || | |||||
| vector_offset >= s->size || | |||||
| imagedata_offset >= s->size) | |||||
| return AVERROR_INVALIDDATA; | |||||
| xan_huffman_decode(opcode_buffer, huffman_segment, opcode_buffer_size); | |||||
| huffman_segment = s->buf + huffman_offset; | |||||
| size_segment = s->buf + size_offset; | |||||
| vector_segment = s->buf + vector_offset; | |||||
| imagedata_segment = s->buf + imagedata_offset; | |||||
| if (xan_huffman_decode(opcode_buffer, opcode_buffer_size, | |||||
| huffman_segment, s->size - huffman_offset) < 0) | |||||
| return AVERROR_INVALIDDATA; | |||||
| if (imagedata_segment[0] == 2) | if (imagedata_segment[0] == 2) | ||||
| xan_unpack(s->buffer2, &imagedata_segment[1], s->buffer2_size); | xan_unpack(s->buffer2, &imagedata_segment[1], s->buffer2_size); | ||||
| @@ -358,6 +379,7 @@ static void xan_wc3_decode_frame(XanContext *s) { | |||||
| y += (x + size) / width; | y += (x + size) / width; | ||||
| x = (x + size) % width; | x = (x + size) % width; | ||||
| } | } | ||||
| return 0; | |||||
| } | } | ||||
| #if RUNTIME_GAMMA | #if RUNTIME_GAMMA | ||||
| @@ -519,7 +541,8 @@ static int xan_decode_frame(AVCodecContext *avctx, | |||||
| s->buf = buf; | s->buf = buf; | ||||
| s->size = buf_size; | s->size = buf_size; | ||||
| xan_wc3_decode_frame(s); | |||||
| if (xan_wc3_decode_frame(s) < 0) | |||||
| return AVERROR_INVALIDDATA; | |||||
| /* release the last frame if it is allocated */ | /* release the last frame if it is allocated */ | ||||
| if (s->last_frame.data[0]) | if (s->last_frame.data[0]) | ||||
| @@ -563,4 +586,3 @@ AVCodec ff_xan_wc3_decoder = { | |||||
| .capabilities = CODEC_CAP_DR1, | .capabilities = CODEC_CAP_DR1, | ||||
| .long_name = NULL_IF_CONFIG_SMALL("Wing Commander III / Xan"), | .long_name = NULL_IF_CONFIG_SMALL("Wing Commander III / Xan"), | ||||
| }; | }; | ||||
| @@ -19,6 +19,7 @@ OBJS-$(HAVE_MMX) += x86/rgb2rgb.o \ | |||||
| x86/swscale_mmx.o \ | x86/swscale_mmx.o \ | ||||
| x86/yuv2rgb_mmx.o | x86/yuv2rgb_mmx.o | ||||
| OBJS-$(HAVE_VIS) += sparc/yuv2rgb_vis.o | OBJS-$(HAVE_VIS) += sparc/yuv2rgb_vis.o | ||||
| OBJS-$(HAVE_YASM) += x86/scale.o | |||||
| TESTPROGS = colorspace swscale | TESTPROGS = colorspace swscale | ||||
| @@ -0,0 +1,429 @@ | |||||
| ;****************************************************************************** | |||||
| ;* x86-optimized horizontal line scaling functions | |||||
| ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> | |||||
| ;* | |||||
| ;* This file is part of Libav. | |||||
| ;* | |||||
| ;* Libav is free software; you can redistribute it and/or | |||||
| ;* modify it under the terms of the GNU Lesser General Public | |||||
| ;* License as published by the Free Software Foundation; either | |||||
| ;* version 2.1 of the License, or (at your option) any later version. | |||||
| ;* | |||||
| ;* Libav is distributed in the hope that it will be useful, | |||||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| ;* Lesser General Public License for more details. | |||||
| ;* | |||||
| ;* You should have received a copy of the GNU Lesser General Public | |||||
| ;* License along with Libav; if not, write to the Free Software | |||||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| ;****************************************************************************** | |||||
| %include "x86inc.asm" | |||||
| %include "x86util.asm" | |||||
| SECTION_RODATA | |||||
| max_19bit_int: times 4 dd 0x7ffff | |||||
| max_19bit_flt: times 4 dd 524287.0 | |||||
| minshort: times 8 dw 0x8000 | |||||
| unicoeff: times 4 dd 0x20000000 | |||||
| SECTION .text | |||||
| ;----------------------------------------------------------------------------- | |||||
| ; horizontal line scaling | |||||
| ; | |||||
| ; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt> | |||||
| ; (SwsContext *c, int{16,32}_t *dst, | |||||
| ; int dstW, const uint{8,16}_t *src, | |||||
| ; const int16_t *filter, | |||||
| ; const int16_t *filterPos, int filterSize); | |||||
| ; | |||||
| ; Scale one horizontal line. Input is either 8-bits width or 16-bits width | |||||
| ; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to | |||||
| ; downscale before multiplying). Filter is 14-bits. Output is either 15bits | |||||
| ; (in int16_t) or 19bits (in int32_t), as given in $intermediate_nbits. Each | |||||
| ; output pixel is generated from $filterSize input pixels, the position of | |||||
| ; the first pixel is given in filterPos[nOutputPixel]. | |||||
| ;----------------------------------------------------------------------------- | |||||
| ; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, opt, n_args, n_xmm | |||||
| %macro SCALE_FUNC 7 | |||||
| cglobal hscale%1to%2_%4_%5, %6, 7, %7 | |||||
| %ifdef ARCH_X86_64 | |||||
| movsxd r2, r2d | |||||
| %endif ; x86-64 | |||||
| %if %2 == 19 | |||||
| %if mmsize == 8 ; mmx | |||||
| mova m2, [max_19bit_int] | |||||
| %elifidn %5, sse4 | |||||
| mova m2, [max_19bit_int] | |||||
| %else ; ssse3/sse2 | |||||
| mova m2, [max_19bit_flt] | |||||
| %endif ; mmx/sse2/ssse3/sse4 | |||||
| %endif ; %2 == 19 | |||||
| %if %1 == 16 | |||||
| mova m6, [minshort] | |||||
| mova m7, [unicoeff] | |||||
| %elif %1 == 8 | |||||
| pxor m3, m3 | |||||
| %endif ; %1 == 8/16 | |||||
| %if %1 == 8 | |||||
| %define movlh movd | |||||
| %define movbh movh | |||||
| %define srcmul 1 | |||||
| %else ; %1 == 9-16 | |||||
| %define movlh movq | |||||
| %define movbh movu | |||||
| %define srcmul 2 | |||||
| %endif ; %1 == 8/9-16 | |||||
| %ifnidn %3, X | |||||
| ; setup loop | |||||
| %if %3 == 8 | |||||
| shl r2, 1 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter | |||||
| %define r2shr 1 | |||||
| %else ; %3 == 4 | |||||
| %define r2shr 0 | |||||
| %endif ; %3 == 8 | |||||
| lea r4, [r4+r2*8] | |||||
| %if %2 == 15 | |||||
| lea r1, [r1+r2*(2>>r2shr)] | |||||
| %else ; %2 == 19 | |||||
| lea r1, [r1+r2*(4>>r2shr)] | |||||
| %endif ; %2 == 15/19 | |||||
| lea r5, [r5+r2*(2>>r2shr)] | |||||
| neg r2 | |||||
| .loop: | |||||
| %if %3 == 4 ; filterSize == 4 scaling | |||||
| ; load 2x4 or 4x4 source pixels into m0/m1 | |||||
| movsx r0, word [r5+r2*2+0] ; filterPos[0] | |||||
| movsx r6, word [r5+r2*2+2] ; filterPos[1] | |||||
| movlh m0, [r3+r0*srcmul] ; src[filterPos[0] + {0,1,2,3}] | |||||
| %if mmsize == 8 | |||||
| movlh m1, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}] | |||||
| %else ; mmsize == 16 | |||||
| %if %1 > 8 | |||||
| movhps m0, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}] | |||||
| %else ; %1 == 8 | |||||
| movd m4, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}] | |||||
| %endif | |||||
| movsx r0, word [r5+r2*2+4] ; filterPos[2] | |||||
| movsx r6, word [r5+r2*2+6] ; filterPos[3] | |||||
| movlh m1, [r3+r0*srcmul] ; src[filterPos[2] + {0,1,2,3}] | |||||
| %if %1 > 8 | |||||
| movhps m1, [r3+r6*srcmul] ; src[filterPos[3] + {0,1,2,3}] | |||||
| %else ; %1 == 8 | |||||
| movd m5, [r3+r6*srcmul] ; src[filterPos[3] + {0,1,2,3}] | |||||
| punpckldq m0, m4 | |||||
| punpckldq m1, m5 | |||||
| %endif ; %1 == 8 && %5 <= ssse | |||||
| %endif ; mmsize == 8/16 | |||||
| %if %1 == 8 | |||||
| punpcklbw m0, m3 ; byte -> word | |||||
| punpcklbw m1, m3 ; byte -> word | |||||
| %endif ; %1 == 8 | |||||
| ; multiply with filter coefficients | |||||
| %if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll | |||||
| ; add back 0x8000 * sum(coeffs) after the horizontal add | |||||
| psubw m0, m6 | |||||
| psubw m1, m6 | |||||
| %endif ; %1 == 16 | |||||
| pmaddwd m0, [r4+r2*8+mmsize*0] ; *= filter[{0,1,..,6,7}] | |||||
| pmaddwd m1, [r4+r2*8+mmsize*1] ; *= filter[{8,9,..,14,15}] | |||||
| ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix) | |||||
| %if mmsize == 8 ; mmx | |||||
| movq m4, m0 | |||||
| punpckldq m0, m1 | |||||
| punpckhdq m4, m1 | |||||
| paddd m0, m4 | |||||
| %elifidn %5, sse2 | |||||
| mova m4, m0 | |||||
| shufps m0, m1, 10001000b | |||||
| shufps m4, m1, 11011101b | |||||
| paddd m0, m4 | |||||
| %else ; ssse3/sse4 | |||||
| phaddd m0, m1 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}], | |||||
| ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}], | |||||
| ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}], | |||||
| ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}] | |||||
| %endif ; mmx/sse2/ssse3/sse4 | |||||
| %else ; %3 == 8, i.e. filterSize == 8 scaling | |||||
| ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5 | |||||
| movsx r0, word [r5+r2*1+0] ; filterPos[0] | |||||
| movsx r6, word [r5+r2*1+2] ; filterPos[1] | |||||
| movbh m0, [r3+ r0 *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}] | |||||
| %if mmsize == 8 | |||||
| movbh m1, [r3+(r0+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}] | |||||
| movbh m4, [r3+ r6 *srcmul] ; src[filterPos[1] + {0,1,2,3}] | |||||
| movbh m5, [r3+(r6+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}] | |||||
| %else ; mmsize == 16 | |||||
| movbh m1, [r3+ r6 *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}] | |||||
| movsx r0, word [r5+r2*1+4] ; filterPos[2] | |||||
| movsx r6, word [r5+r2*1+6] ; filterPos[3] | |||||
| movbh m4, [r3+ r0 *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}] | |||||
| movbh m5, [r3+ r6 *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}] | |||||
| %endif ; mmsize == 8/16 | |||||
| %if %1 == 8 | |||||
| punpcklbw m0, m3 ; byte -> word | |||||
| punpcklbw m1, m3 ; byte -> word | |||||
| punpcklbw m4, m3 ; byte -> word | |||||
| punpcklbw m5, m3 ; byte -> word | |||||
| %endif ; %1 == 8 | |||||
| ; multiply | |||||
| %if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll | |||||
| ; add back 0x8000 * sum(coeffs) after the horizontal add | |||||
| psubw m0, m6 | |||||
| psubw m1, m6 | |||||
| psubw m4, m6 | |||||
| psubw m5, m6 | |||||
| %endif ; %1 == 16 | |||||
| pmaddwd m0, [r4+r2*8+mmsize*0] ; *= filter[{0,1,..,6,7}] | |||||
| pmaddwd m1, [r4+r2*8+mmsize*1] ; *= filter[{8,9,..,14,15}] | |||||
| pmaddwd m4, [r4+r2*8+mmsize*2] ; *= filter[{16,17,..,22,23}] | |||||
| pmaddwd m5, [r4+r2*8+mmsize*3] ; *= filter[{24,25,..,30,31}] | |||||
| ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix) | |||||
| %if mmsize == 8 | |||||
| paddd m0, m1 | |||||
| paddd m4, m5 | |||||
| movq m1, m0 | |||||
| punpckldq m0, m4 | |||||
| punpckhdq m1, m4 | |||||
| paddd m0, m1 | |||||
| %elifidn %5, sse2 | |||||
| %if %1 == 8 | |||||
| %define mex m6 | |||||
| %else | |||||
| %define mex m3 | |||||
| %endif | |||||
| ; emulate horizontal add as transpose + vertical add | |||||
| mova mex, m0 | |||||
| punpckldq m0, m1 | |||||
| punpckhdq mex, m1 | |||||
| paddd m0, mex | |||||
| mova m1, m4 | |||||
| punpckldq m4, m5 | |||||
| punpckhdq m1, m5 | |||||
| paddd m4, m1 | |||||
| mova m1, m0 | |||||
| punpcklqdq m0, m4 | |||||
| punpckhqdq m1, m4 | |||||
| paddd m0, m1 | |||||
| %else ; ssse3/sse4 | |||||
| ; FIXME if we rearrange the filter in pairs of 4, we can | |||||
| ; load pixels likewise and use 2 x paddd + phaddd instead | |||||
| ; of 3 x phaddd here, faster on older cpus | |||||
| phaddd m0, m1 | |||||
| phaddd m4, m5 | |||||
| phaddd m0, m4 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}], | |||||
| ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}], | |||||
| ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}], | |||||
| ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}] | |||||
| %endif ; mmx/sse2/ssse3/sse4 | |||||
| %endif ; %3 == 4/8 | |||||
| %else ; %3 == X, i.e. any filterSize scaling | |||||
| %ifidn %4, X4 | |||||
| %define r6sub 4 | |||||
| %else ; %4 == X || %4 == X8 | |||||
| %define r6sub 0 | |||||
| %endif ; %4 ==/!= X4 | |||||
| %ifdef ARCH_X86_64 | |||||
| push r12 | |||||
| movsxd r6, r6d ; filterSize | |||||
| lea r12, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4] | |||||
| %define src_reg r11 | |||||
| %define r1x r10 | |||||
| %define filter2 r12 | |||||
| %else ; x86-32 | |||||
| lea r0, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4] | |||||
| mov r6m, r0 | |||||
| %define src_reg r3 | |||||
| %define r1x r1 | |||||
| %define filter2 r6m | |||||
| %endif ; x86-32/64 | |||||
| lea r5, [r5+r2*2] | |||||
| %if %2 == 15 | |||||
| lea r1, [r1+r2*2] | |||||
| %else ; %2 == 19 | |||||
| lea r1, [r1+r2*4] | |||||
| %endif ; %2 == 15/19 | |||||
| movifnidn r1mp, r1 | |||||
| neg r2 | |||||
| .loop: | |||||
| movsx r0, word [r5+r2*2+0] ; filterPos[0] | |||||
| movsx r1x, word [r5+r2*2+2] ; filterPos[1] | |||||
| ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)? | |||||
| pxor m4, m4 | |||||
| pxor m5, m5 | |||||
| mov src_reg, r3mp | |||||
| .innerloop: | |||||
| ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5 | |||||
| movbh m0, [src_reg+r0 *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}] | |||||
| movbh m1, [src_reg+(r1x+r6sub)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}] | |||||
| %if %1 == 8 | |||||
| punpcklbw m0, m3 | |||||
| punpcklbw m1, m3 | |||||
| %endif ; %1 == 8 | |||||
| ; multiply | |||||
| %if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll | |||||
| ; add back 0x8000 * sum(coeffs) after the horizontal add | |||||
| psubw m0, m6 | |||||
| psubw m1, m6 | |||||
| %endif ; %1 == 16 | |||||
| pmaddwd m0, [r4 ] ; filter[{0,1,2,3(,4,5,6,7)}] | |||||
| pmaddwd m1, [r4+(r6+r6sub)*2] ; filter[filtersize+{0,1,2,3(,4,5,6,7)}] | |||||
| paddd m4, m0 | |||||
| paddd m5, m1 | |||||
| add r4, mmsize | |||||
| add src_reg, srcmul*mmsize/2 | |||||
| cmp src_reg, filter2 ; while (src += 4) < &src[filterSize] | |||||
| jl .innerloop | |||||
| %ifidn %4, X4 | |||||
| movsx r1x, word [r5+r2*2+2] ; filterPos[1] | |||||
| movlh m0, [src_reg+r0 *srcmul] ; split last 4 srcpx of dstpx[0] | |||||
| sub r1x, r6 ; and first 4 srcpx of dstpx[1] | |||||
| %if %1 > 8 | |||||
| movhps m0, [src_reg+(r1x+r6sub)*srcmul] | |||||
| %else ; %1 == 8 | |||||
| movd m1, [src_reg+(r1x+r6sub)*srcmul] | |||||
| punpckldq m0, m1 | |||||
| %endif ; %1 == 8 && %5 <= ssse | |||||
| %if %1 == 8 | |||||
| punpcklbw m0, m3 | |||||
| %endif ; %1 == 8 | |||||
| %if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll | |||||
| ; add back 0x8000 * sum(coeffs) after the horizontal add | |||||
| psubw m0, m6 | |||||
| %endif ; %1 == 16 | |||||
| pmaddwd m0, [r4] | |||||
| %endif ; %4 == X4 | |||||
| lea r4, [r4+(r6+r6sub)*2] | |||||
| %if mmsize == 8 ; mmx | |||||
| movq m0, m4 | |||||
| punpckldq m4, m5 | |||||
| punpckhdq m0, m5 | |||||
| paddd m0, m4 | |||||
| %else ; mmsize == 16 | |||||
| %ifidn %5, sse2 | |||||
| mova m1, m4 | |||||
| punpcklqdq m4, m5 | |||||
| punpckhqdq m1, m5 | |||||
| paddd m4, m1 | |||||
| %else ; ssse3/sse4 | |||||
| phaddd m4, m5 | |||||
| %endif ; sse2/ssse3/sse4 | |||||
| %ifidn %4, X4 | |||||
| paddd m4, m0 | |||||
| %endif ; %3 == X4 | |||||
| %ifidn %5, sse2 | |||||
| pshufd m4, m4, 11011000b | |||||
| movhlps m0, m4 | |||||
| paddd m0, m4 | |||||
| %else ; ssse3/sse4 | |||||
| phaddd m4, m4 | |||||
| SWAP 0, 4 | |||||
| %endif ; sse2/ssse3/sse4 | |||||
| %endif ; mmsize == 8/16 | |||||
| %endif ; %3 ==/!= X | |||||
| %if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned | |||||
| paddd m0, m7 | |||||
| %endif ; %1 == 16 | |||||
| ; clip, store | |||||
| psrad m0, 14 + %1 - %2 | |||||
| %ifidn %3, X | |||||
| movifnidn r1, r1mp | |||||
| %endif ; %3 == X | |||||
| %if %2 == 15 | |||||
| packssdw m0, m0 | |||||
| %ifnidn %3, X | |||||
| movh [r1+r2*(2>>r2shr)], m0 | |||||
| %else ; %3 == X | |||||
| movd [r1+r2*2], m0 | |||||
| %endif ; %3 ==/!= X | |||||
| %else ; %2 == 19 | |||||
| %if mmsize == 8 | |||||
| PMINSD_MMX m0, m2, m4 | |||||
| %elifidn %5, sse4 | |||||
| pminsd m0, m2 | |||||
| %else ; sse2/ssse3 | |||||
| cvtdq2ps m0, m0 | |||||
| minps m0, m2 | |||||
| cvtps2dq m0, m0 | |||||
| %endif ; mmx/sse2/ssse3/sse4 | |||||
| %ifnidn %3, X | |||||
| movu [r1+r2*(4>>r2shr)], m0 | |||||
| %else ; %3 == X | |||||
| movq [r1+r2*4], m0 | |||||
| %endif ; %3 ==/!= X | |||||
| %endif ; %2 == 15/19 | |||||
| %ifnidn %3, X | |||||
| add r2, (mmsize<<r2shr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels) | |||||
| ; per iteration. see "shl r2,1" above as for why we do this | |||||
| %else ; %3 == X | |||||
| add r2, 2 | |||||
| %endif ; %3 ==/!= X | |||||
| jl .loop | |||||
| %ifnidn %3, X | |||||
| REP_RET | |||||
| %else ; %3 == X | |||||
| %ifdef ARCH_X86_64 | |||||
| pop r12 | |||||
| RET | |||||
| %else ; x86-32 | |||||
| REP_RET | |||||
| %endif ; x86-32/64 | |||||
| %endif ; %3 ==/!= X | |||||
| %endmacro | |||||
| ; SCALE_FUNCS source_width, intermediate_nbits, opt, n_xmm | |||||
| %macro SCALE_FUNCS 4 | |||||
| SCALE_FUNC %1, %2, 4, 4, %3, 6, %4 | |||||
| SCALE_FUNC %1, %2, 8, 8, %3, 6, %4 | |||||
| %if mmsize == 8 | |||||
| SCALE_FUNC %1, %2, X, X, %3, 7, %4 | |||||
| %else | |||||
| SCALE_FUNC %1, %2, X, X4, %3, 7, %4 | |||||
| SCALE_FUNC %1, %2, X, X8, %3, 7, %4 | |||||
| %endif | |||||
| %endmacro | |||||
| ; SCALE_FUNCS2 opt, 8_xmm_args, 9to10_xmm_args, 16_xmm_args | |||||
| %macro SCALE_FUNCS2 4 | |||||
| %ifnidn %1, sse4 | |||||
| SCALE_FUNCS 8, 15, %1, %2 | |||||
| SCALE_FUNCS 9, 15, %1, %3 | |||||
| SCALE_FUNCS 10, 15, %1, %3 | |||||
| SCALE_FUNCS 16, 15, %1, %4 | |||||
| %endif ; !sse4 | |||||
| SCALE_FUNCS 8, 19, %1, %2 | |||||
| SCALE_FUNCS 9, 19, %1, %3 | |||||
| SCALE_FUNCS 10, 19, %1, %3 | |||||
| SCALE_FUNCS 16, 19, %1, %4 | |||||
| %endmacro | |||||
| %ifdef ARCH_X86_32 | |||||
| INIT_MMX | |||||
| SCALE_FUNCS2 mmx, 0, 0, 0 | |||||
| %endif | |||||
| INIT_XMM | |||||
| SCALE_FUNCS2 sse2, 6, 7, 8 | |||||
| SCALE_FUNCS2 ssse3, 6, 6, 8 | |||||
| SCALE_FUNCS2 sse4, 6, 6, 8 | |||||
| @@ -176,6 +176,41 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI | |||||
| } | } | ||||
| } | } | ||||
| #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \ | |||||
| extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \ | |||||
| SwsContext *c, int16_t *data, \ | |||||
| int dstW, const uint8_t *src, \ | |||||
| const int16_t *filter, \ | |||||
| const int16_t *filterPos, int filterSize); | |||||
| #define SCALE_FUNCS(filter_n, opt) \ | |||||
| SCALE_FUNC(filter_n, 8, 15, opt); \ | |||||
| SCALE_FUNC(filter_n, 9, 15, opt); \ | |||||
| SCALE_FUNC(filter_n, 10, 15, opt); \ | |||||
| SCALE_FUNC(filter_n, 16, 15, opt); \ | |||||
| SCALE_FUNC(filter_n, 8, 19, opt); \ | |||||
| SCALE_FUNC(filter_n, 9, 19, opt); \ | |||||
| SCALE_FUNC(filter_n, 10, 19, opt); \ | |||||
| SCALE_FUNC(filter_n, 16, 19, opt) | |||||
| #define SCALE_FUNCS_MMX(opt) \ | |||||
| SCALE_FUNCS(4, opt); \ | |||||
| SCALE_FUNCS(8, opt); \ | |||||
| SCALE_FUNCS(X, opt) | |||||
| #define SCALE_FUNCS_SSE(opt) \ | |||||
| SCALE_FUNCS(4, opt); \ | |||||
| SCALE_FUNCS(8, opt); \ | |||||
| SCALE_FUNCS(X4, opt); \ | |||||
| SCALE_FUNCS(X8, opt) | |||||
| #if ARCH_X86_32 | |||||
| SCALE_FUNCS_MMX(mmx); | |||||
| #endif | |||||
| SCALE_FUNCS_SSE(sse2); | |||||
| SCALE_FUNCS_SSE(ssse3); | |||||
| SCALE_FUNCS_SSE(sse4); | |||||
| void ff_sws_init_swScale_mmx(SwsContext *c) | void ff_sws_init_swScale_mmx(SwsContext *c) | ||||
| { | { | ||||
| int cpu_flags = av_get_cpu_flags(); | int cpu_flags = av_get_cpu_flags(); | ||||
| @@ -186,4 +221,55 @@ void ff_sws_init_swScale_mmx(SwsContext *c) | |||||
| if (cpu_flags & AV_CPU_FLAG_MMX2) | if (cpu_flags & AV_CPU_FLAG_MMX2) | ||||
| sws_init_swScale_MMX2(c); | sws_init_swScale_MMX2(c); | ||||
| #endif | #endif | ||||
| #if HAVE_YASM | |||||
| #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \ | |||||
| if (c->srcBpc == 8) { \ | |||||
| hscalefn = c->dstBpc <= 10 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \ | |||||
| ff_hscale8to19_ ## filtersize ## _ ## opt1; \ | |||||
| } else if (c->srcBpc == 9) { \ | |||||
| hscalefn = c->dstBpc <= 10 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \ | |||||
| ff_hscale9to19_ ## filtersize ## _ ## opt1; \ | |||||
| } else if (c->srcBpc == 10) { \ | |||||
| hscalefn = c->dstBpc <= 10 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \ | |||||
| ff_hscale10to19_ ## filtersize ## _ ## opt1; \ | |||||
| } else if(c->srcBpc == 16 && !((c->srcFormat==PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)) { \ | |||||
| hscalefn = c->dstBpc <= 10 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \ | |||||
| ff_hscale16to19_ ## filtersize ## _ ## opt1; \ | |||||
| } \ | |||||
| } while (0) | |||||
| #define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \ | |||||
| switch (filtersize) { \ | |||||
| case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \ | |||||
| case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \ | |||||
| default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \ | |||||
| } | |||||
| #if ARCH_X86_32 | |||||
| if (cpu_flags & AV_CPU_FLAG_MMX) { | |||||
| ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx); | |||||
| ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx); | |||||
| } | |||||
| #endif | |||||
| #define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \ | |||||
| switch (filtersize) { \ | |||||
| case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \ | |||||
| case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \ | |||||
| default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); \ | |||||
| else ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); \ | |||||
| break; \ | |||||
| } | |||||
| if (cpu_flags & AV_CPU_FLAG_SSE2) { | |||||
| ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2); | |||||
| ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2); | |||||
| } | |||||
| if (cpu_flags & AV_CPU_FLAG_SSSE3) { | |||||
| ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3); | |||||
| ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3); | |||||
| } | |||||
| if (cpu_flags & AV_CPU_FLAG_SSE4) { | |||||
| /* Xto15 don't need special sse4 functions */ | |||||
| ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3); | |||||
| ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3); | |||||
| } | |||||
| #endif | |||||
| } | } | ||||
| @@ -1951,164 +1951,6 @@ static void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV, | |||||
| RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24); | RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24); | ||||
| } | } | ||||
| #if !COMPILE_TEMPLATE_MMX2 | |||||
| // bilinear / bicubic scaling | |||||
| static void RENAME(hScale)(SwsContext *c, int16_t *dst, int dstW, | |||||
| const uint8_t *src, const int16_t *filter, | |||||
| const int16_t *filterPos, int filterSize) | |||||
| { | |||||
| assert(filterSize % 4 == 0 && filterSize>0); | |||||
| if (filterSize==4) { // Always true for upscaling, sometimes for down, too. | |||||
| x86_reg counter= -2*dstW; | |||||
| filter-= counter*2; | |||||
| filterPos-= counter/2; | |||||
| dst-= counter/2; | |||||
| __asm__ volatile( | |||||
| #if defined(PIC) | |||||
| "push %%"REG_b" \n\t" | |||||
| #endif | |||||
| "pxor %%mm7, %%mm7 \n\t" | |||||
| "push %%"REG_BP" \n\t" // we use 7 regs here ... | |||||
| "mov %%"REG_a", %%"REG_BP" \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | |||||
| "movzwl (%2, %%"REG_BP"), %%eax \n\t" | |||||
| "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" | |||||
| "movq (%1, %%"REG_BP", 4), %%mm1 \n\t" | |||||
| "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t" | |||||
| "movd (%3, %%"REG_a"), %%mm0 \n\t" | |||||
| "movd (%3, %%"REG_b"), %%mm2 \n\t" | |||||
| "punpcklbw %%mm7, %%mm0 \n\t" | |||||
| "punpcklbw %%mm7, %%mm2 \n\t" | |||||
| "pmaddwd %%mm1, %%mm0 \n\t" | |||||
| "pmaddwd %%mm2, %%mm3 \n\t" | |||||
| "movq %%mm0, %%mm4 \n\t" | |||||
| "punpckldq %%mm3, %%mm0 \n\t" | |||||
| "punpckhdq %%mm3, %%mm4 \n\t" | |||||
| "paddd %%mm4, %%mm0 \n\t" | |||||
| "psrad $7, %%mm0 \n\t" | |||||
| "packssdw %%mm0, %%mm0 \n\t" | |||||
| "movd %%mm0, (%4, %%"REG_BP") \n\t" | |||||
| "add $4, %%"REG_BP" \n\t" | |||||
| " jnc 1b \n\t" | |||||
| "pop %%"REG_BP" \n\t" | |||||
| #if defined(PIC) | |||||
| "pop %%"REG_b" \n\t" | |||||
| #endif | |||||
| : "+a" (counter) | |||||
| : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |||||
| #if !defined(PIC) | |||||
| : "%"REG_b | |||||
| #endif | |||||
| ); | |||||
| } else if (filterSize==8) { | |||||
| x86_reg counter= -2*dstW; | |||||
| filter-= counter*4; | |||||
| filterPos-= counter/2; | |||||
| dst-= counter/2; | |||||
| __asm__ volatile( | |||||
| #if defined(PIC) | |||||
| "push %%"REG_b" \n\t" | |||||
| #endif | |||||
| "pxor %%mm7, %%mm7 \n\t" | |||||
| "push %%"REG_BP" \n\t" // we use 7 regs here ... | |||||
| "mov %%"REG_a", %%"REG_BP" \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | |||||
| "movzwl (%2, %%"REG_BP"), %%eax \n\t" | |||||
| "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" | |||||
| "movq (%1, %%"REG_BP", 8), %%mm1 \n\t" | |||||
| "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t" | |||||
| "movd (%3, %%"REG_a"), %%mm0 \n\t" | |||||
| "movd (%3, %%"REG_b"), %%mm2 \n\t" | |||||
| "punpcklbw %%mm7, %%mm0 \n\t" | |||||
| "punpcklbw %%mm7, %%mm2 \n\t" | |||||
| "pmaddwd %%mm1, %%mm0 \n\t" | |||||
| "pmaddwd %%mm2, %%mm3 \n\t" | |||||
| "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t" | |||||
| "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t" | |||||
| "movd 4(%3, %%"REG_a"), %%mm4 \n\t" | |||||
| "movd 4(%3, %%"REG_b"), %%mm2 \n\t" | |||||
| "punpcklbw %%mm7, %%mm4 \n\t" | |||||
| "punpcklbw %%mm7, %%mm2 \n\t" | |||||
| "pmaddwd %%mm1, %%mm4 \n\t" | |||||
| "pmaddwd %%mm2, %%mm5 \n\t" | |||||
| "paddd %%mm4, %%mm0 \n\t" | |||||
| "paddd %%mm5, %%mm3 \n\t" | |||||
| "movq %%mm0, %%mm4 \n\t" | |||||
| "punpckldq %%mm3, %%mm0 \n\t" | |||||
| "punpckhdq %%mm3, %%mm4 \n\t" | |||||
| "paddd %%mm4, %%mm0 \n\t" | |||||
| "psrad $7, %%mm0 \n\t" | |||||
| "packssdw %%mm0, %%mm0 \n\t" | |||||
| "movd %%mm0, (%4, %%"REG_BP") \n\t" | |||||
| "add $4, %%"REG_BP" \n\t" | |||||
| " jnc 1b \n\t" | |||||
| "pop %%"REG_BP" \n\t" | |||||
| #if defined(PIC) | |||||
| "pop %%"REG_b" \n\t" | |||||
| #endif | |||||
| : "+a" (counter) | |||||
| : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | |||||
| #if !defined(PIC) | |||||
| : "%"REG_b | |||||
| #endif | |||||
| ); | |||||
| } else { | |||||
| const uint8_t *offset = src+filterSize; | |||||
| x86_reg counter= -2*dstW; | |||||
| //filter-= counter*filterSize/2; | |||||
| filterPos-= counter/2; | |||||
| dst-= counter/2; | |||||
| __asm__ volatile( | |||||
| "pxor %%mm7, %%mm7 \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | |||||
| "mov %2, %%"REG_c" \n\t" | |||||
| "movzwl (%%"REG_c", %0), %%eax \n\t" | |||||
| "movzwl 2(%%"REG_c", %0), %%edx \n\t" | |||||
| "mov %5, %%"REG_c" \n\t" | |||||
| "pxor %%mm4, %%mm4 \n\t" | |||||
| "pxor %%mm5, %%mm5 \n\t" | |||||
| "2: \n\t" | |||||
| "movq (%1), %%mm1 \n\t" | |||||
| "movq (%1, %6), %%mm3 \n\t" | |||||
| "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t" | |||||
| "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t" | |||||
| "punpcklbw %%mm7, %%mm0 \n\t" | |||||
| "punpcklbw %%mm7, %%mm2 \n\t" | |||||
| "pmaddwd %%mm1, %%mm0 \n\t" | |||||
| "pmaddwd %%mm2, %%mm3 \n\t" | |||||
| "paddd %%mm3, %%mm5 \n\t" | |||||
| "paddd %%mm0, %%mm4 \n\t" | |||||
| "add $8, %1 \n\t" | |||||
| "add $4, %%"REG_c" \n\t" | |||||
| "cmp %4, %%"REG_c" \n\t" | |||||
| " jb 2b \n\t" | |||||
| "add %6, %1 \n\t" | |||||
| "movq %%mm4, %%mm0 \n\t" | |||||
| "punpckldq %%mm5, %%mm4 \n\t" | |||||
| "punpckhdq %%mm5, %%mm0 \n\t" | |||||
| "paddd %%mm0, %%mm4 \n\t" | |||||
| "psrad $7, %%mm4 \n\t" | |||||
| "packssdw %%mm4, %%mm4 \n\t" | |||||
| "mov %3, %%"REG_a" \n\t" | |||||
| "movd %%mm4, (%%"REG_a", %0) \n\t" | |||||
| "add $4, %0 \n\t" | |||||
| " jnc 1b \n\t" | |||||
| : "+r" (counter), "+r" (filter) | |||||
| : "m" (filterPos), "m" (dst), "m"(offset), | |||||
| "m" (src), "r" ((x86_reg)filterSize*2) | |||||
| : "%"REG_a, "%"REG_c, "%"REG_d | |||||
| ); | |||||
| } | |||||
| } | |||||
| #endif /* !COMPILE_TEMPLATE_MMX2 */ | |||||
| static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc, | static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc, | ||||
| const int16_t *filter, const int16_t *filterPos, long filterSize, int shift) | const int16_t *filter, const int16_t *filterPos, long filterSize, int shift) | ||||
| { | { | ||||
| @@ -2265,7 +2107,6 @@ static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, | |||||
| } | } | ||||
| } | } | ||||
| #if COMPILE_TEMPLATE_MMX2 | #if COMPILE_TEMPLATE_MMX2 | ||||
| static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, | static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, | ||||
| int dstWidth, const uint8_t *src, | int dstWidth, const uint8_t *src, | ||||
| @@ -2466,10 +2307,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) | |||||
| } | } | ||||
| if (c->srcBpc == 8 && c->dstBpc <= 10) { | if (c->srcBpc == 8 && c->dstBpc <= 10) { | ||||
| #if !COMPILE_TEMPLATE_MMX2 | |||||
| c->hyScale = c->hcScale = RENAME(hScale ); | |||||
| #endif /* !COMPILE_TEMPLATE_MMX2 */ | |||||
| // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one). | // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one). | ||||
| #if COMPILE_TEMPLATE_MMX2 | #if COMPILE_TEMPLATE_MMX2 | ||||
| if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed) | if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed) | ||||