Also implement SSE2/AVX variants.
@@ -17,7 +17,8 @@ OBJS-$(HAVE_MMX) += x86/rgb2rgb.o \
                         x86/swscale_mmx.o \
                         x86/yuv2rgb_mmx.o
 OBJS-$(HAVE_VIS) += sparc/yuv2rgb_vis.o
-MMX-OBJS-$(HAVE_YASM) += x86/output.o \
+MMX-OBJS-$(HAVE_YASM) += x86/input.o \
+                         x86/output.o \
                          x86/scale.o
 TESTPROGS = colorspace swscale
@@ -0,0 +1,242 @@
;******************************************************************************
;* x86-optimized input routines; shuffles packed YUV formats into
;* individual planes, and also converts RGB into YUV planes.
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

SECTION .text

;-----------------------------------------------------------------------------
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
; and
; void <fmt>ToUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
;                      const uint8_t *unused, int w);
;-----------------------------------------------------------------------------
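For reference, here is a minimal scalar C sketch of what the ToY entry points compute (helper names are illustrative and not part of the patch): YUYV carries luma in the even bytes, UYVY in the odd bytes.

#include <stdint.h>

/* scalar reference only; the real implementations are the yasm kernels below */
static void yuyv_to_y_ref(uint8_t *dst, const uint8_t *src, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = src[2 * i];     /* { Y0, U0, Y1, V0, ... } -> even bytes */
}

static void uyvy_to_y_ref(uint8_t *dst, const uint8_t *src, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = src[2 * i + 1]; /* { U0, Y0, V0, Y1, ... } -> odd bytes */
}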
; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_Y 2
.loop_%1:
    mov%1    m0, [srcq+wq*2]        ; (byte) { Y0, U0, Y1, V0, ... }
    mov%1    m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
%ifidn %2, yuyv
    pand     m0, m2                 ; (word) { Y0, Y1, ..., Y7 }
    pand     m1, m2                 ; (word) { Y8, Y9, ..., Y15 }
%else ; uyvy
    psrlw    m0, 8                  ; (word) { Y0, Y1, ..., Y7 }
    psrlw    m1, 8                  ; (word) { Y8, Y9, ..., Y15 }
%endif ; yuyv/uyvy
    packuswb m0, m1                 ; (byte) { Y0, ..., Y15 }
    mova     [dstq+wq], m0
    add      wq, mmsize
    jl .loop_%1
    REP_RET
%endmacro
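The same mask-or-shift plus pack step, written with SSE2 intrinsics as a rough sketch of one loop iteration (illustration only; for UYVY the mask would be replaced by a right shift of 8):

#include <stdint.h>
#include <emmintrin.h>

/* one LOOP_YUYV_TO_Y-style step: 32 input bytes -> 16 luma bytes */
static void yuyv_to_y_iter_sse2(uint8_t *dst, const uint8_t *src)
{
    const __m128i mask = _mm_set1_epi16(0x00ff);             /* pcmpeqb + psrlw 8 */
    __m128i m0 = _mm_loadu_si128((const __m128i *)src);
    __m128i m1 = _mm_loadu_si128((const __m128i *)(src + 16));
    m0 = _mm_and_si128(m0, mask);                             /* words Y0..Y7  */
    m1 = _mm_and_si128(m1, mask);                             /* words Y8..Y15 */
    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(m0, m1)); /* bytes Y0..Y15 */
}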
; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
;      will be the same (i.e. YUYV+AVX), and thus we don't need to
;      split the loop in an aligned and unaligned case
%macro YUYV_TO_Y_FN 2-3
cglobal %2ToY, 3, 3, %1, dst, src, w
%ifdef ARCH_X86_64
    movsxd   wq, wd
%endif
    add      dstq, wq
%if mmsize == 16
    test     srcq, 15
%endif
    lea      srcq, [srcq+wq*2]
%ifidn %2, yuyv
    pcmpeqb  m2, m2                 ; (byte) { 0xff } x 16
    psrlw    m2, 8                  ; (word) { 0x00ff } x 8
%endif ; yuyv
%if mmsize == 16
    jnz .loop_u_start
    neg      wq
    LOOP_YUYV_TO_Y  a, %2
.loop_u_start:
    neg      wq
    LOOP_YUYV_TO_Y  u, %2
%else ; mmsize == 8
    neg      wq
    LOOP_YUYV_TO_Y  a, %2
%endif ; mmsize == 8/16
%endmacro
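The prologue biases both pointers past the end of the row and negates w, so the loop index runs from -w up to 0 and the add/jl pair doubles as the loop test. The aligned/unaligned dispatch works because lea, pcmpeqb and psrlw do not modify EFLAGS, so the jnz still reflects the earlier `test srcq, 15`. A C-level sketch of the addressing idiom (illustrative, assuming w is a multiple of the vector width):

#include <stdint.h>

static void biased_loop_sketch(uint8_t *dst, const uint8_t *src, int w)
{
    dst += w;                              /* add dstq, wq          */
    src += 2 * w;                          /* lea srcq, [srcq+wq*2] */
    for (int i = -w; i < 0; i += 16) {     /* neg wq ... add wq, mmsize ; jl */
        for (int j = 0; j < 16; j++)       /* stands in for one vector step  */
            dst[i + j] = src[2 * (i + j)];
    }
}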
; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_UV 2
.loop_%1:
%ifidn %2, yuyv
    mov%1    m0, [srcq+wq*4]            ; (byte) { Y0, U0, Y1, V0, ... }
    mov%1    m1, [srcq+wq*4+mmsize]     ; (byte) { Y8, U4, Y9, V4, ... }
    psrlw    m0, 8                      ; (word) { U0, V0, ..., U3, V3 }
    psrlw    m1, 8                      ; (word) { U4, V4, ..., U7, V7 }
%else ; uyvy
%if cpuflag(avx)
    vpand    m0, m2, [srcq+wq*4]        ; (word) { U0, V0, ..., U3, V3 }
    vpand    m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 }
%else
    mov%1    m0, [srcq+wq*4]            ; (byte) { Y0, U0, Y1, V0, ... }
    mov%1    m1, [srcq+wq*4+mmsize]     ; (byte) { Y8, U4, Y9, V4, ... }
    pand     m0, m2                     ; (word) { U0, V0, ..., U3, V3 }
    pand     m1, m2                     ; (word) { U4, V4, ..., U7, V7 }
%endif
%endif ; yuyv/uyvy
    packuswb m0, m1                     ; (byte) { U0, V0, ..., U7, V7 }
    pand     m1, m0, m2                 ; (word) { U0, U1, ..., U7 }
    psrlw    m0, 8                      ; (word) { V0, V1, ..., V7 }
%if mmsize == 16
    packuswb m1, m0                     ; (byte) { U0, ..., U7, V0, ..., V7 }
    movh     [dstUq+wq], m1
    movhps   [dstVq+wq], m1
%else ; mmsize == 8
    packuswb m1, m1                     ; (byte) { U0, ..., U3 }
    packuswb m0, m0                     ; (byte) { V0, ..., V3 }
    movh     [dstUq+wq], m1
    movh     [dstVq+wq], m0
%endif ; mmsize == 8/16
    add      wq, mmsize / 2
    jl .loop_%1
    REP_RET
%endmacro
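As a scalar reference (illustrative names; w here counts chroma samples, i.e. half the luma width), the whole LOOP_YUYV_TO_UV body boils down to:

#include <stdint.h>

static void yuyv_to_uv_ref(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, int w)
{
    for (int i = 0; i < w; i++) {
        dstU[i] = src[4 * i + 1];   /* U and V sit at the odd bytes in YUYV */
        dstV[i] = src[4 * i + 3];
    }
    /* for UYVY they sit at the even bytes: src[4*i] and src[4*i + 2] */
}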
; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
;      will be the same (i.e. UYVY+AVX), and thus we don't need to
;      split the loop in an aligned and unaligned case
%macro YUYV_TO_UV_FN 2-3
cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
%ifdef ARCH_X86_64
    movsxd   wq, r4m
%else ; x86-32
    mov      wq, r4m
%endif
    add      dstUq, wq
    add      dstVq, wq
%if mmsize == 16 && %0 == 2
    test     srcq, 15
%endif
    lea      srcq, [srcq+wq*4]
    pcmpeqb  m2, m2                 ; (byte) { 0xff } x 16
    psrlw    m2, 8                  ; (word) { 0x00ff } x 8
    ; NOTE: if uyvy+avx, u/a are identical
%if mmsize == 16 && %0 == 2
    jnz .loop_u_start
    neg      wq
    LOOP_YUYV_TO_UV a, %2
.loop_u_start:
    neg      wq
    LOOP_YUYV_TO_UV u, %2
%else ; mmsize == 8
    neg      wq
    LOOP_YUYV_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro
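The width is loaded from r4m because, in the C-side prototype these entry points are declared with later in this patch (the INPUT_UV_FUNC macro in swscale_mmx.c), w is the fifth argument; cglobal only declares the three pointer arguments, so the width is fetched from argument slot 4 (counted from zero). One concrete instance of that prototype:

void ff_yuyvToUV_sse2(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
                      const uint8_t *unused1, int w, uint32_t *unused2);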
; %1 = a (aligned) or u (unaligned)
; %2 = nv12 or nv21
%macro LOOP_NVXX_TO_UV 2
.loop_%1:
    mov%1    m0, [srcq+wq*2]        ; (byte) { U0, V0, U1, V1, ... }
    mov%1    m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
    pand     m2, m0, m4             ; (word) { U0, U1, ..., U7 }
    pand     m3, m1, m4             ; (word) { U8, U9, ..., U15 }
    psrlw    m0, 8                  ; (word) { V0, V1, ..., V7 }
    psrlw    m1, 8                  ; (word) { V8, V9, ..., V15 }
    packuswb m2, m3                 ; (byte) { U0, ..., U15 }
    packuswb m0, m1                 ; (byte) { V0, ..., V15 }
%ifidn %2, nv12
    mova     [dstUq+wq], m2
    mova     [dstVq+wq], m0
%else ; nv21
    mova     [dstVq+wq], m2
    mova     [dstUq+wq], m0
%endif ; nv12/21
    add      wq, mmsize
    jl .loop_%1
    REP_RET
%endmacro
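Scalar reference for the same split (illustrative only): NV12 interleaves U,V and NV21 interleaves V,U, so the only difference is which destination each byte goes to, exactly as the store swap above.

#include <stdint.h>

static void nv12_to_uv_ref(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, int w)
{
    for (int i = 0; i < w; i++) {
        dstU[i] = src[2 * i];       /* for NV21, write this byte to dstV instead */
        dstV[i] = src[2 * i + 1];   /* ... and this one to dstU                  */
    }
}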
; %1 = nr. of XMM registers
; %2 = nv12 or nv21
%macro NVXX_TO_UV_FN 2
cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
%ifdef ARCH_X86_64
    movsxd   wq, r4m
%else ; x86-32
    mov      wq, r4m
%endif
    add      dstUq, wq
    add      dstVq, wq
%if mmsize == 16
    test     srcq, 15
%endif
    lea      srcq, [srcq+wq*2]
    pcmpeqb  m4, m4                 ; (byte) { 0xff } x 16
    psrlw    m4, 8                  ; (word) { 0x00ff } x 8
%if mmsize == 16
    jnz .loop_u_start
    neg      wq
    LOOP_NVXX_TO_UV a, %2
.loop_u_start:
    neg      wq
    LOOP_NVXX_TO_UV u, %2
%else ; mmsize == 8
    neg      wq
    LOOP_NVXX_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro
%ifdef ARCH_X86_32
INIT_MMX mmx
YUYV_TO_Y_FN  0, yuyv
YUYV_TO_Y_FN  0, uyvy
YUYV_TO_UV_FN 0, yuyv
YUYV_TO_UV_FN 0, uyvy
NVXX_TO_UV_FN 0, nv12
NVXX_TO_UV_FN 0, nv21
%endif

INIT_XMM sse2
YUYV_TO_Y_FN  3, yuyv
YUYV_TO_Y_FN  2, uyvy
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21

INIT_XMM avx
; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
; that's not faster in practice
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy, 1
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
@@ -244,6 +244,26 @@ VSCALE_FUNCS(sse2, sse2);
 VSCALE_FUNC(16, sse4);
 VSCALE_FUNCS(avx, avx);
+
+#define INPUT_UV_FUNC(fmt, opt) \
+extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
+                                       const uint8_t *src, const uint8_t *unused1, \
+                                       int w, uint32_t *unused2)
+#define INPUT_FUNC(fmt, opt) \
+extern void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
+                                      int w, uint32_t *unused); \
+    INPUT_UV_FUNC(fmt, opt)
+#define INPUT_FUNCS(opt) \
+    INPUT_FUNC(uyvy, opt); \
+    INPUT_FUNC(yuyv, opt); \
+    INPUT_UV_FUNC(nv12, opt); \
+    INPUT_UV_FUNC(nv21, opt)
+
+#if ARCH_X86_32
+INPUT_FUNCS(mmx);
+#endif
+INPUT_FUNCS(sse2);
+INPUT_FUNCS(avx);
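For illustration, INPUT_FUNC(yuyv, sse2) declares the following two prototypes, which match the symbols generated by the cglobal lines in x86/input.asm:

extern void ff_yuyvToY_sse2(uint8_t *dst, const uint8_t *src,
                            int w, uint32_t *unused);
extern void ff_yuyvToUV_sse2(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src, const uint8_t *unused1,
                             int w, uint32_t *unused2);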
 void ff_sws_init_swScale_mmx(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -296,6 +316,30 @@ switch(c->dstBpc){ \
         ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
         ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
         ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMX2);
+
+        switch (c->srcFormat) {
+        case PIX_FMT_Y400A:
+            c->lumToYV12 = ff_yuyvToY_mmx;
+            if (c->alpPixBuf)
+                c->alpToYV12 = ff_uyvyToY_mmx;
+            break;
+        case PIX_FMT_YUYV422:
+            c->lumToYV12 = ff_yuyvToY_mmx;
+            c->chrToYV12 = ff_yuyvToUV_mmx;
+            break;
+        case PIX_FMT_UYVY422:
+            c->lumToYV12 = ff_uyvyToY_mmx;
+            c->chrToYV12 = ff_uyvyToUV_mmx;
+            break;
+        case PIX_FMT_NV12:
+            c->chrToYV12 = ff_nv12ToUV_mmx;
+            break;
+        case PIX_FMT_NV21:
+            c->chrToYV12 = ff_nv21ToUV_mmx;
+            break;
+        default:
+            break;
+        }
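The PIX_FMT_Y400A case can reuse the packed-YUV kernels because gray-with-alpha interleaves its bytes as { Y0, A0, Y1, A1, ... }: luma occupies the even bytes (the yuyvToY access pattern) and alpha the odd bytes (the uyvyToY pattern). A scalar sketch of that reuse, with an illustrative helper name:

#include <stdint.h>

static void gray8a_split_ref(uint8_t *y, uint8_t *a, const uint8_t *src, int w)
{
    for (int i = 0; i < w; i++) {
        y[i] = src[2 * i];      /* even bytes -> ff_yuyvToY_* */
        a[i] = src[2 * i + 1];  /* odd bytes  -> ff_uyvyToY_* */
    }
}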
     }
     if (cpu_flags & AV_CPU_FLAG_MMX2) {
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmx2,);
@@ -314,6 +358,28 @@ switch(c->dstBpc){ \
         ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2,);
         ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
+
+        switch (c->srcFormat) {
+        case PIX_FMT_Y400A:
+            c->lumToYV12 = ff_yuyvToY_sse2;
+            if (c->alpPixBuf)
+                c->alpToYV12 = ff_uyvyToY_sse2;
+            break;
+        case PIX_FMT_YUYV422:
+            c->lumToYV12 = ff_yuyvToY_sse2;
+            c->chrToYV12 = ff_yuyvToUV_sse2;
+            break;
+        case PIX_FMT_UYVY422:
+            c->lumToYV12 = ff_uyvyToY_sse2;
+            c->chrToYV12 = ff_uyvyToUV_sse2;
+            break;
+        case PIX_FMT_NV12:
+            c->chrToYV12 = ff_nv12ToUV_sse2;
+            break;
+        case PIX_FMT_NV21:
+            c->chrToYV12 = ff_nv21ToUV_sse2;
+            break;
+        }
     }
     if (cpu_flags & AV_CPU_FLAG_SSSE3) {
         ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
@@ -332,6 +398,23 @@ switch(c->dstBpc){ \
     if (cpu_flags & AV_CPU_FLAG_AVX) {
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx,);
         ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
+
+        switch (c->srcFormat) {
+        case PIX_FMT_YUYV422:
+            c->chrToYV12 = ff_yuyvToUV_avx;
+            break;
+        case PIX_FMT_UYVY422:
+            c->chrToYV12 = ff_uyvyToUV_avx;
+            break;
+        case PIX_FMT_NV12:
+            c->chrToYV12 = ff_nv12ToUV_avx;
+            break;
+        case PIX_FMT_NV21:
+            c->chrToYV12 = ff_nv21ToUV_avx;
+            break;
+        default:
+            break;
+        }
     }
 #endif
 }
@@ -1361,147 +1361,6 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
     }
 }
 
-#if !COMPILE_TEMPLATE_MMX2
-//FIXME yuy2* can read up to 7 samples too much
-
-static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src,
-                            int width, uint32_t *unused)
-{
-    __asm__ volatile(
-    "movq "MANGLE(bm01010101)", %%mm2 \n\t"
-    "mov %0, %%"REG_a" \n\t"
-    "1: \n\t"
-    "movq (%1, %%"REG_a",2), %%mm0 \n\t"
-    "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
-    "pand %%mm2, %%mm0 \n\t"
-    "pand %%mm2, %%mm1 \n\t"
-    "packuswb %%mm1, %%mm0 \n\t"
-    "movq %%mm0, (%2, %%"REG_a") \n\t"
-    "add $8, %%"REG_a" \n\t"
-    " js 1b \n\t"
-    : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
-    : "%"REG_a
-    );
-}
-
-static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
-                             const uint8_t *src1, const uint8_t *src2,
-                             int width, uint32_t *unused)
-{
-    __asm__ volatile(
-    "movq "MANGLE(bm01010101)", %%mm4 \n\t"
-    "mov %0, %%"REG_a" \n\t"
-    "1: \n\t"
-    "movq (%1, %%"REG_a",4), %%mm0 \n\t"
-    "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
-    "psrlw $8, %%mm0 \n\t"
-    "psrlw $8, %%mm1 \n\t"
-    "packuswb %%mm1, %%mm0 \n\t"
-    "movq %%mm0, %%mm1 \n\t"
-    "psrlw $8, %%mm0 \n\t"
-    "pand %%mm4, %%mm1 \n\t"
-    "packuswb %%mm0, %%mm0 \n\t"
-    "packuswb %%mm1, %%mm1 \n\t"
-    "movd %%mm0, (%3, %%"REG_a") \n\t"
-    "movd %%mm1, (%2, %%"REG_a") \n\t"
-    "add $4, %%"REG_a" \n\t"
-    " js 1b \n\t"
-    : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
-    : "%"REG_a
-    );
-    assert(src1 == src2);
-}
-
-/* This is almost identical to the previous, end exists only because
- * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
-static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src,
-                            int width, uint32_t *unused)
-{
-    __asm__ volatile(
-    "mov %0, %%"REG_a" \n\t"
-    "1: \n\t"
-    "movq (%1, %%"REG_a",2), %%mm0 \n\t"
-    "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
-    "psrlw $8, %%mm0 \n\t"
-    "psrlw $8, %%mm1 \n\t"
-    "packuswb %%mm1, %%mm0 \n\t"
-    "movq %%mm0, (%2, %%"REG_a") \n\t"
-    "add $8, %%"REG_a" \n\t"
-    " js 1b \n\t"
-    : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
-    : "%"REG_a
-    );
-}
-
-static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
-                             const uint8_t *src1, const uint8_t *src2,
-                             int width, uint32_t *unused)
-{
-    __asm__ volatile(
-    "movq "MANGLE(bm01010101)", %%mm4 \n\t"
-    "mov %0, %%"REG_a" \n\t"
-    "1: \n\t"
-    "movq (%1, %%"REG_a",4), %%mm0 \n\t"
-    "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
-    "pand %%mm4, %%mm0 \n\t"
-    "pand %%mm4, %%mm1 \n\t"
-    "packuswb %%mm1, %%mm0 \n\t"
-    "movq %%mm0, %%mm1 \n\t"
-    "psrlw $8, %%mm0 \n\t"
-    "pand %%mm4, %%mm1 \n\t"
-    "packuswb %%mm0, %%mm0 \n\t"
-    "packuswb %%mm1, %%mm1 \n\t"
-    "movd %%mm0, (%3, %%"REG_a") \n\t"
-    "movd %%mm1, (%2, %%"REG_a") \n\t"
-    "add $4, %%"REG_a" \n\t"
-    " js 1b \n\t"
-    : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
-    : "%"REG_a
-    );
-    assert(src1 == src2);
-}
-
-static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
-                                              const uint8_t *src, int width)
-{
-    __asm__ volatile(
-    "movq "MANGLE(bm01010101)", %%mm4 \n\t"
-    "mov %0, %%"REG_a" \n\t"
-    "1: \n\t"
-    "movq (%1, %%"REG_a",2), %%mm0 \n\t"
-    "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
-    "movq %%mm0, %%mm2 \n\t"
-    "movq %%mm1, %%mm3 \n\t"
-    "pand %%mm4, %%mm0 \n\t"
-    "pand %%mm4, %%mm1 \n\t"
-    "psrlw $8, %%mm2 \n\t"
-    "psrlw $8, %%mm3 \n\t"
-    "packuswb %%mm1, %%mm0 \n\t"
-    "packuswb %%mm3, %%mm2 \n\t"
-    "movq %%mm0, (%2, %%"REG_a") \n\t"
-    "movq %%mm2, (%3, %%"REG_a") \n\t"
-    "add $8, %%"REG_a" \n\t"
-    " js 1b \n\t"
-    : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
-    : "%"REG_a
-    );
-}
-
-static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
-                             const uint8_t *src1, const uint8_t *src2,
-                             int width, uint32_t *unused)
-{
-    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
-}
-
-static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
-                             const uint8_t *src1, const uint8_t *src2,
-                             int width, uint32_t *unused)
-{
-    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
-}
-#endif /* !COMPILE_TEMPLATE_MMX2 */
 static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src,
                                                   int width, enum PixelFormat srcFormat)
 {
@@ -1856,15 +1715,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
 #endif /* COMPILE_TEMPLATE_MMX2 */
     }
-#if !COMPILE_TEMPLATE_MMX2
-    switch(srcFormat) {
-    case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
-    case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
-    case PIX_FMT_NV12    : c->chrToYV12 = RENAME(nv12ToUV); break;
-    case PIX_FMT_NV21    : c->chrToYV12 = RENAME(nv21ToUV); break;
-    default: break;
-    }
-#endif /* !COMPILE_TEMPLATE_MMX2 */
     if (!c->chrSrcHSubSample) {
         switch(srcFormat) {
         case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
@@ -1874,21 +1724,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
     }
     switch (srcFormat) {
-#if !COMPILE_TEMPLATE_MMX2
-    case PIX_FMT_YUYV422 :
-    case PIX_FMT_Y400A   : c->lumToYV12 = RENAME(yuy2ToY); break;
-    case PIX_FMT_UYVY422 : c->lumToYV12 = RENAME(uyvyToY); break;
-#endif /* !COMPILE_TEMPLATE_MMX2 */
     case PIX_FMT_BGR24   : c->lumToYV12 = RENAME(bgr24ToY); break;
     case PIX_FMT_RGB24   : c->lumToYV12 = RENAME(rgb24ToY); break;
     default: break;
     }
-#if !COMPILE_TEMPLATE_MMX2
-    if (c->alpPixBuf) {
-        switch (srcFormat) {
-        case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
-        default: break;
-        }
-    }
-#endif /* !COMPILE_TEMPLATE_MMX2 */
 }