| @@ -0,0 +1,113 @@ | |||
| /* | |||
| * Copyright (c) 2019 Eugene Lyapustin | |||
| * | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| #ifndef AVFILTER_V360_H | |||
| #define AVFILTER_V360_H | |||
| #include "avfilter.h" | |||
| enum Projections { /* Projection identifiers; config_output() switches on V360Context.in with these values. */ | |||
| EQUIRECTANGULAR, | |||
| CUBEMAP_3_2, | |||
| CUBEMAP_6_1, | |||
| EQUIANGULAR, | |||
| FLAT, | |||
| DUAL_FISHEYE, | |||
| BARREL, | |||
| CUBEMAP_1_6, | |||
| NB_PROJECTIONS, /* Last enumerator: its value equals the number of projections listed above. */ | |||
| }; | |||
| enum InterpMethod { /* Interpolation modes; ff_v360_init() picks the remap_line implementation from V360Context.interp. */ | |||
| NEAREST, /* remap1_*_line_c: one source sample per output pixel */ | |||
| BILINEAR, /* remap2_*_line_c: weighted 2x2 window */ | |||
| BICUBIC, /* remap4_*_line_c: weighted 4x4 window */ | |||
| LANCZOS, /* shares the remap4_*_line_c functions with BICUBIC */ | |||
| NB_INTERP_METHODS, /* Last enumerator: its value equals the number of modes listed above. */ | |||
| }; | |||
| enum Faces { /* Six face slots named by grid position (TOP/BOTTOM row, LEFT/MIDDLE/RIGHT column); presumably cells of a 3x2 cubemap layout -- TODO confirm. */ | |||
| TOP_LEFT, | |||
| TOP_MIDDLE, | |||
| TOP_RIGHT, | |||
| BOTTOM_LEFT, | |||
| BOTTOM_MIDDLE, | |||
| BOTTOM_RIGHT, | |||
| NB_FACES, /* Last enumerator: its value equals the number of faces listed above. */ | |||
| }; | |||
| enum Direction { /* Cube directions, one per signed axis; note that FRONT is -Z and BACK is +Z. */ | |||
| RIGHT, ///< Axis +X | |||
| LEFT, ///< Axis -X | |||
| UP, ///< Axis +Y | |||
| DOWN, ///< Axis -Y | |||
| FRONT, ///< Axis -Z | |||
| BACK, ///< Axis +Z | |||
| NB_DIRECTIONS, /* Last enumerator: its value equals the number of directions listed above. */ | |||
| }; | |||
| enum Rotation { /* Rotation steps in 90-degree increments (per the names); presumably stored in the *_cubemap_face_rotation arrays -- TODO confirm. */ | |||
| ROT_0, | |||
| ROT_90, | |||
| ROT_180, | |||
| ROT_270, | |||
| NB_ROTATIONS, /* Last enumerator: its value equals the number of rotations listed above. */ | |||
| }; | |||
| typedef struct V360Context { /* Filter state; the remap slice functions obtain it from ctx->priv. */ | |||
| const AVClass *class; | |||
| int in, out; /* `in` holds an enum Projections value (config_output() switches on it); `out` is presumably the output projection -- TODO confirm */ | |||
| int interp; /* enum InterpMethod value; read by ff_v360_init() and ff_v360_init_x86() */ | |||
| int width, height; | |||
| char* in_forder; | |||
| char* out_forder; | |||
| char* in_frot; | |||
| char* out_frot; | |||
| int in_cubemap_face_order[6]; | |||
| int out_cubemap_direction_order[6]; | |||
| int in_cubemap_face_rotation[6]; | |||
| int out_cubemap_face_rotation[6]; | |||
| float in_pad, out_pad; | |||
| float yaw, pitch, roll; | |||
| int h_flip, v_flip, d_flip; | |||
| float h_fov, v_fov; | |||
| float flat_range[3]; | |||
| int planewidth[4], planeheight[4]; /* per-plane row length and row count iterated by the remap slice functions */ | |||
| int inplanewidth[4], inplaneheight[4]; /* presumably the input-side counterparts; not referenced in the visible code -- TODO confirm */ | |||
| int nb_planes; /* number of planes the remap slice functions loop over */ | |||
| uint16_t *u[4], *v[4]; /* per-plane source coordinates: line functions read s[v * in_linesize + u], so u is the column and v the row */ | |||
| int16_t *ker[4]; /* per-plane interpolation weights; the line functions shift the weighted sum right by 14 */ | |||
| int (*remap_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs); /* slice worker; jobnr/nb_jobs select the row range */ | |||
| void (*remap_line)(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize, /* row kernel installed by ff_v360_init(); in_linesize is in bytes */ | |||
| const uint16_t *u, const uint16_t *v, const int16_t *ker); | |||
| } V360Context; | |||
| void ff_v360_init(V360Context *s, int depth); /* Sets s->remap_line from s->interp and depth (depth <= 8 picks the 8-bit C variants), then calls ff_v360_init_x86() when ARCH_X86_64 is set. */ | |||
| void ff_v360_init_x86(V360Context *s, int depth); /* Replaces s->remap_line with an AVX2 routine when EXTERNAL_AVX2_FAST(cpu_flags) holds and depth <= 8. */ | |||
| #endif /* AVFILTER_V360_H */ | |||
| @@ -41,88 +41,7 @@ | |||
| #include "formats.h" | |||
| #include "internal.h" | |||
| #include "video.h" | |||
| enum Projections { /* Projection identifiers; config_output() switches on V360Context.in with these values. */ | |||
| EQUIRECTANGULAR, | |||
| CUBEMAP_3_2, | |||
| CUBEMAP_6_1, | |||
| EQUIANGULAR, | |||
| FLAT, | |||
| DUAL_FISHEYE, | |||
| BARREL, | |||
| CUBEMAP_1_6, | |||
| NB_PROJECTIONS, /* Last enumerator: its value equals the number of projections listed above. */ | |||
| }; | |||
| enum InterpMethod { /* Interpolation modes stored in V360Context.interp. */ | |||
| NEAREST, | |||
| BILINEAR, | |||
| BICUBIC, | |||
| LANCZOS, | |||
| NB_INTERP_METHODS, /* Last enumerator: its value equals the number of modes listed above. */ | |||
| }; | |||
| enum Faces { /* Six face slots named by grid position (TOP/BOTTOM row, LEFT/MIDDLE/RIGHT column); presumably cells of a 3x2 cubemap layout -- TODO confirm. */ | |||
| TOP_LEFT, | |||
| TOP_MIDDLE, | |||
| TOP_RIGHT, | |||
| BOTTOM_LEFT, | |||
| BOTTOM_MIDDLE, | |||
| BOTTOM_RIGHT, | |||
| NB_FACES, /* Last enumerator: its value equals the number of faces listed above. */ | |||
| }; | |||
| enum Direction { /* Cube directions, one per signed axis; note that FRONT is -Z and BACK is +Z. */ | |||
| RIGHT, ///< Axis +X | |||
| LEFT, ///< Axis -X | |||
| UP, ///< Axis +Y | |||
| DOWN, ///< Axis -Y | |||
| FRONT, ///< Axis -Z | |||
| BACK, ///< Axis +Z | |||
| NB_DIRECTIONS, /* Last enumerator: its value equals the number of directions listed above. */ | |||
| }; | |||
| enum Rotation { /* Rotation steps in 90-degree increments (per the names); presumably stored in the *_cubemap_face_rotation arrays -- TODO confirm. */ | |||
| ROT_0, | |||
| ROT_90, | |||
| ROT_180, | |||
| ROT_270, | |||
| NB_ROTATIONS, /* Last enumerator: its value equals the number of rotations listed above. */ | |||
| }; | |||
| typedef struct V360Context { /* Filter state; the remap slice functions obtain it from ctx->priv. */ | |||
| const AVClass *class; | |||
| int in, out; /* `in` holds an enum Projections value (config_output() switches on it); `out` is presumably the output projection -- TODO confirm */ | |||
| int interp; /* enum InterpMethod value */ | |||
| int width, height; | |||
| char* in_forder; | |||
| char* out_forder; | |||
| char* in_frot; | |||
| char* out_frot; | |||
| int in_cubemap_face_order[6]; | |||
| int out_cubemap_direction_order[6]; | |||
| int in_cubemap_face_rotation[6]; | |||
| int out_cubemap_face_rotation[6]; | |||
| float in_pad, out_pad; | |||
| float yaw, pitch, roll; | |||
| int h_flip, v_flip, d_flip; | |||
| float h_fov, v_fov; | |||
| float flat_range[3]; | |||
| int planewidth[4], planeheight[4]; /* per-plane row length and row count iterated by the remap slice functions */ | |||
| int inplanewidth[4], inplaneheight[4]; /* presumably the input-side counterparts; not referenced in the visible code -- TODO confirm */ | |||
| int nb_planes; /* number of planes the remap slice functions loop over */ | |||
| uint16_t *u[4], *v[4]; /* per-plane source coordinates: u indexes the column, v the row (src[v * in_linesize + u]) */ | |||
| int16_t *ker[4]; /* per-plane interpolation weights read by the remap functions */ | |||
| int (*remap_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs); /* slice worker; jobnr/nb_jobs select the row range */ | |||
| } V360Context; | |||
| #include "v360.h" | |||
| typedef struct ThreadData { | |||
| AVFrame *in; | |||
| @@ -251,47 +170,22 @@ static int query_formats(AVFilterContext *ctx) | |||
| return ff_set_common_formats(ctx, fmts_list); | |||
| } | |||
| /** | |||
| * Generate no-interpolation remapping function with a given pixel depth. | |||
| * | |||
| * @param bits number of bits per pixel | |||
| * @param div number of bytes per pixel | |||
| */ | |||
| #define DEFINE_REMAP1(bits, div) \ | |||
| static int remap1_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) \ | |||
| { \ | |||
| ThreadData *td = (ThreadData*)arg; \ | |||
| const V360Context *s = ctx->priv; \ | |||
| const AVFrame *in = td->in; \ | |||
| AVFrame *out = td->out; \ | |||
| \ | |||
| int plane, x, y; \ | |||
| \ | |||
| for (plane = 0; plane < s->nb_planes; plane++) { \ | |||
| const int in_linesize = in->linesize[plane] / div; \ | |||
| const int out_linesize = out->linesize[plane] / div; \ | |||
| const uint##bits##_t *src = (const uint##bits##_t *)in->data[plane]; \ | |||
| uint##bits##_t *dst = (uint##bits##_t *)out->data[plane]; \ | |||
| const int width = s->planewidth[plane]; \ | |||
| const int height = s->planeheight[plane]; \ | |||
| \ | |||
| const int slice_start = (height * jobnr ) / nb_jobs; \ | |||
| const int slice_end = (height * (jobnr + 1)) / nb_jobs; \ | |||
| \ | |||
| for (y = slice_start; y < slice_end; y++) { \ | |||
| const uint16_t *u = s->u[plane] + y * width; \ | |||
| const uint16_t *v = s->v[plane] + y * width; \ | |||
| uint##bits##_t *d = dst + y * out_linesize; \ | |||
| for (x = 0; x < width; x++) \ | |||
| *d++ = src[v[x] * in_linesize + u[x]]; \ | |||
| } \ | |||
| } \ | |||
| \ | |||
| return 0; \ | |||
| #define DEFINE_REMAP1_LINE(bits, div) /* Generates remap1_<bits>bit_line_c(): fills one output row with one source sample per pixel; bits = sample width, div = bytes per sample. */ \ | |||
| static void remap1_##bits##bit_line_c(uint8_t *dst, int width, const uint8_t *src, \ | |||
| ptrdiff_t in_linesize, \ | |||
| const uint16_t *u, const uint16_t *v, const int16_t *ker) /* ker is not read here; the parameter list matches V360Context.remap_line */ \ | |||
| { \ | |||
| const uint##bits##_t *s = (const uint##bits##_t *)src; \ | |||
| uint##bits##_t *d = (uint##bits##_t *)dst; \ | |||
| \ | |||
| in_linesize /= div; /* byte stride -> stride in samples */ \ | |||
| \ | |||
| for (int x = 0; x < width; x++) \ | |||
| d[x] = s[v[x] * in_linesize + u[x]]; /* v[x] selects the source row, u[x] the column */ \ | |||
| } | |||
| DEFINE_REMAP1( 8, 1) | |||
| DEFINE_REMAP1(16, 2) | |||
| DEFINE_REMAP1_LINE( 8, 1) | |||
| DEFINE_REMAP1_LINE(16, 2) | |||
| typedef struct XYRemap { | |||
| uint16_t u[4][4]; | |||
| @@ -304,9 +198,8 @@ typedef struct XYRemap { | |||
| * | |||
| * @param ws size of interpolation window | |||
| * @param bits number of bits per pixel | |||
| * @param div number of bytes per pixel | |||
| */ | |||
| #define DEFINE_REMAP(ws, bits, div) \ | |||
| #define DEFINE_REMAP(ws, bits) \ | |||
| static int remap##ws##_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) \ | |||
| { \ | |||
| ThreadData *td = (ThreadData*)arg; \ | |||
| @@ -314,48 +207,85 @@ static int remap##ws##_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jo | |||
| const AVFrame *in = td->in; \ | |||
| AVFrame *out = td->out; \ | |||
| \ | |||
| int plane, x, y, i, j; \ | |||
| \ | |||
| for (plane = 0; plane < s->nb_planes; plane++) { \ | |||
| const int in_linesize = in->linesize[plane] / div; \ | |||
| const int out_linesize = out->linesize[plane] / div; \ | |||
| const uint##bits##_t *src = (const uint##bits##_t *)in->data[plane]; \ | |||
| uint##bits##_t *dst = (uint##bits##_t *)out->data[plane]; \ | |||
| for (int plane = 0; plane < s->nb_planes; plane++) { \ | |||
| const int in_linesize = in->linesize[plane]; \ | |||
| const int out_linesize = out->linesize[plane]; \ | |||
| const uint8_t *src = in->data[plane]; \ | |||
| uint8_t *dst = out->data[plane]; \ | |||
| const int width = s->planewidth[plane]; \ | |||
| const int height = s->planeheight[plane]; \ | |||
| \ | |||
| const int slice_start = (height * jobnr ) / nb_jobs; \ | |||
| const int slice_end = (height * (jobnr + 1)) / nb_jobs; \ | |||
| \ | |||
| for (y = slice_start; y < slice_end; y++) { \ | |||
| uint##bits##_t *d = dst + y * out_linesize; \ | |||
| for (int y = slice_start; y < slice_end; y++) { \ | |||
| const uint16_t *u = s->u[plane] + y * width * ws * ws; \ | |||
| const uint16_t *v = s->v[plane] + y * width * ws * ws; \ | |||
| const int16_t *ker = s->ker[plane] + y * width * ws * ws; \ | |||
| for (x = 0; x < width; x++) { \ | |||
| const uint16_t *uu = u + x * ws * ws; \ | |||
| const uint16_t *vv = v + x * ws * ws; \ | |||
| const int16_t *kker = ker + x * ws * ws; \ | |||
| int tmp = 0; \ | |||
| \ | |||
| for (i = 0; i < ws; i++) { \ | |||
| for (j = 0; j < ws; j++) { \ | |||
| tmp += kker[i * ws + j] * src[vv[i * ws + j] * in_linesize + uu[i * ws + j]]; \ | |||
| } \ | |||
| } \ | |||
| \ | |||
| *d++ = av_clip_uint##bits(tmp >> (15 - ws)); \ | |||
| } \ | |||
| s->remap_line(dst + y * out_linesize, width, src, in_linesize, u, v, ker); \ | |||
| } \ | |||
| } \ | |||
| \ | |||
| return 0; \ | |||
| } | |||
| DEFINE_REMAP(2, 8, 1) | |||
| DEFINE_REMAP(4, 8, 1) | |||
| DEFINE_REMAP(2, 16, 2) | |||
| DEFINE_REMAP(4, 16, 2) | |||
| DEFINE_REMAP(1, 8) | |||
| DEFINE_REMAP(2, 8) | |||
| DEFINE_REMAP(4, 8) | |||
| DEFINE_REMAP(1, 16) | |||
| DEFINE_REMAP(2, 16) | |||
| DEFINE_REMAP(4, 16) | |||
| #define DEFINE_REMAP_LINE(ws, bits, div) /* Generates remap<ws>_<bits>bit_line_c(): each output pixel is a weighted sum over a ws x ws window; bits = sample width, div = bytes per sample. */ \ | |||
| static void remap##ws##_##bits##bit_line_c(uint8_t *dst, int width, const uint8_t *src, \ | |||
| ptrdiff_t in_linesize, \ | |||
| const uint16_t *u, const uint16_t *v, const int16_t *ker) \ | |||
| { \ | |||
| const uint##bits##_t *s = (const uint##bits##_t *)src; \ | |||
| uint##bits##_t *d = (uint##bits##_t *)dst; \ | |||
| \ | |||
| in_linesize /= div; /* byte stride -> stride in samples */ \ | |||
| \ | |||
| for (int x = 0; x < width; x++) { \ | |||
| const uint16_t *uu = u + x * ws * ws; /* every output pixel owns ws*ws consecutive entries of u, v and ker */ \ | |||
| const uint16_t *vv = v + x * ws * ws; \ | |||
| const int16_t *kker = ker + x * ws * ws; \ | |||
| int tmp = 0; \ | |||
| \ | |||
| for (int i = 0; i < ws; i++) { \ | |||
| for (int j = 0; j < ws; j++) { \ | |||
| tmp += kker[i * ws + j] * s[vv[i * ws + j] * in_linesize + uu[i * ws + j]]; /* weight times the sample at row vv, column uu */ \ | |||
| } \ | |||
| } \ | |||
| \ | |||
| d[x] = av_clip_uint##bits(tmp >> 14); /* drop 14 fractional bits (the kernel builders scale weights by 16384), then clip to the sample range */ \ | |||
| } \ | |||
| } | |||
| DEFINE_REMAP_LINE(2, 8, 1) | |||
| DEFINE_REMAP_LINE(4, 8, 1) | |||
| DEFINE_REMAP_LINE(2, 16, 2) | |||
| DEFINE_REMAP_LINE(4, 16, 2) | |||
| /** | |||
| * Install the row-remapping function matching the interpolation mode. | |||
| * | |||
| * The C implementation is chosen first; on x86-64 builds the | |||
| * architecture-specific initializer may then replace it. | |||
| * If s->interp is none of the handled modes, s->remap_line is left untouched. | |||
| * | |||
| * @param s filter context; s->interp is read, s->remap_line is written | |||
| * @param depth bit depth; values up to 8 select the 8-bit functions, | |||
| * larger values the 16-bit ones | |||
| */ | |||
| void ff_v360_init(V360Context *s, int depth) | |||
| { | |||
|     const int low_depth = depth <= 8; | |||
|     if (s->interp == NEAREST) { | |||
|         s->remap_line = low_depth ? remap1_8bit_line_c : remap1_16bit_line_c; | |||
|     } else if (s->interp == BILINEAR) { | |||
|         s->remap_line = low_depth ? remap2_8bit_line_c : remap2_16bit_line_c; | |||
|     } else if (s->interp == BICUBIC || s->interp == LANCZOS) { | |||
|         /* both 4x4 modes share one line function */ | |||
|         s->remap_line = low_depth ? remap4_8bit_line_c : remap4_16bit_line_c; | |||
|     } | |||
|     if (ARCH_X86_64) | |||
|         ff_v360_init_x86(s, depth); | |||
| } | |||
| /** | |||
| * Save nearest pixel coordinates for remapping. | |||
| @@ -399,10 +329,10 @@ static void bilinear_kernel(float du, float dv, const XYRemap *r_tmp, | |||
| } | |||
| } | |||
| ker[0] = (1.f - du) * (1.f - dv) * 8192; | |||
| ker[1] = du * (1.f - dv) * 8192; | |||
| ker[2] = (1.f - du) * dv * 8192; | |||
| ker[3] = du * dv * 8192; | |||
| ker[0] = (1.f - du) * (1.f - dv) * 16384; | |||
| ker[1] = du * (1.f - dv) * 16384; | |||
| ker[2] = (1.f - du) * dv * 16384; | |||
| ker[3] = du * dv * 16384; | |||
| } | |||
| /** | |||
| @@ -446,7 +376,7 @@ static void bicubic_kernel(float du, float dv, const XYRemap *r_tmp, | |||
| for (j = 0; j < 4; j++) { | |||
| u[i * 4 + j] = r_tmp->u[i][j]; | |||
| v[i * 4 + j] = r_tmp->v[i][j]; | |||
| ker[i * 4 + j] = du_coeffs[j] * dv_coeffs[i] * 2048; | |||
| ker[i * 4 + j] = du_coeffs[j] * dv_coeffs[i] * 16384; | |||
| } | |||
| } | |||
| } | |||
| @@ -501,7 +431,7 @@ static void lanczos_kernel(float du, float dv, const XYRemap *r_tmp, | |||
| for (j = 0; j < 4; j++) { | |||
| u[i * 4 + j] = r_tmp->u[i][j]; | |||
| v[i * 4 + j] = r_tmp->v[i][j]; | |||
| ker[i * 4 + j] = du_coeffs[j] * dv_coeffs[i] * 2048; | |||
| ker[i * 4 + j] = du_coeffs[j] * dv_coeffs[i] * 16384; | |||
| } | |||
| } | |||
| } | |||
| @@ -2038,6 +1968,8 @@ static int config_output(AVFilterLink *outlink) | |||
| av_assert0(0); | |||
| } | |||
| ff_v360_init(s, depth); | |||
| switch (s->in) { | |||
| case EQUIRECTANGULAR: | |||
| in_transform = xyz_to_equirect; | |||
| @@ -31,6 +31,7 @@ OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend_init.o | |||
| OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold_init.o | |||
| OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o | |||
| OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o | |||
| OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360_init.o | |||
| OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o | |||
| OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o | |||
| @@ -66,5 +67,6 @@ X86ASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o | |||
| X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold.o | |||
| X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o | |||
| X86ASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o | |||
| X86ASM-OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360.o | |||
| X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif.o | |||
| X86ASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o | |||
| @@ -0,0 +1,142 @@ | |||
| ;***************************************************************************** | |||
| ;* x86-optimized functions for v360 filter | |||
| ;* | |||
| ;* This file is part of FFmpeg. | |||
| ;* | |||
| ;* FFmpeg is free software; you can redistribute it and/or | |||
| ;* modify it under the terms of the GNU Lesser General Public | |||
| ;* License as published by the Free Software Foundation; either | |||
| ;* version 2.1 of the License, or (at your option) any later version. | |||
| ;* | |||
| ;* FFmpeg is distributed in the hope that it will be useful, | |||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| ;* Lesser General Public License for more details. | |||
| ;* | |||
| ;* You should have received a copy of the GNU Lesser General Public | |||
| ;* License along with FFmpeg; if not, write to the Free Software | |||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| ;****************************************************************************** | |||
| %if HAVE_AVX2_EXTERNAL && ARCH_X86_64 | |||
| %include "libavutil/x86/x86util.asm" | |||
| SECTION_RODATA | |||
| pb_mask: db 0,4,8,12,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 | |||
| pd_255: times 4 dd 255 | |||
| SECTION .text | |||
| ; void ff_remap1_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize, | |||
| ; const uint16_t *u, const uint16_t *v, const int16_t *ker); | |||
| ; Nearest-neighbour row copy for 8-bit samples: every loop iteration gathers | |||
| ; the source bytes for mmsize / 4 output pixels and stores them to dst. | |||
| INIT_YMM avx2 | |||
| cglobal remap1_8bit_line, 6, 7, 6, dst, width, src, in_linesize, u, v, x | |||
| movsxdifnidn widthq, widthd | |||
| xor xq, xq | |||
| movd xm0, in_linesized | |||
| ; m4 = all ones; copied into the gather mask before each vpgatherdd | |||
| pcmpeqw m4, m4 | |||
| VBROADCASTI128 m3, [pb_mask] | |||
| ; m0 = in_linesize replicated into every dword | |||
| vpbroadcastd m0, xm0 | |||
| .loop: | |||
| ; NOTE(review): pmovsxwd sign-extends, while the C prototype declares u/v as | |||
| ; uint16_t; coordinates >= 32768 would turn negative here -- confirm. | |||
| pmovsxwd m1, [vq + xq * 2] | |||
| pmovsxwd m2, [uq + xq * 2] | |||
| ; byte offset of each source sample = v * in_linesize + u | |||
| pmulld m1, m0 | |||
| paddd m1, m2 | |||
| mova m2, m4 | |||
| vpgatherdd m5, [srcq + m1], m2 | |||
| ; pb_mask keeps bytes 0, 4, 8, 12 of each 128-bit lane: the low byte of every gathered dword | |||
| pshufb m1, m5, m3 | |||
| vextracti128 xm2, m1, 1 | |||
| ; 4 pixels from the low lane, then 4 from the high lane | |||
| movd [dstq+xq], xm1 | |||
| movd [dstq+xq+4], xm2 | |||
| ; NOTE(review): there is no tail handling; when width is not a multiple of the | |||
| ; step, the last iteration reads u/v and writes dst past width -- confirm the | |||
| ; buffers are padded accordingly. | |||
| add xq, mmsize / 4 | |||
| cmp xq, widthq | |||
| jl .loop | |||
| RET | |||
| ; Row remap with a 2x2 window for 8-bit samples: each 128-bit lane carries the | |||
| ; four taps of one output pixel, so one iteration produces mmsize / 16 pixels. | |||
| INIT_YMM avx2 | |||
| cglobal remap2_8bit_line, 7, 8, 8, dst, width, src, in_linesize, u, v, ker, x | |||
| movsxdifnidn widthq, widthd | |||
| xor xq, xq | |||
| movd xm0, in_linesized | |||
| ; m7 = all ones; copied into the gather mask before each vpgatherdd | |||
| pcmpeqw m7, m7 | |||
| vpbroadcastd m0, xm0 | |||
| ; m6 = 255 in every dword, used to keep only the gathered byte | |||
| vpbroadcastd m6, [pd_255] | |||
| .loop: | |||
| ; 4 taps * 2 bytes per pixel, hence the xq * 8 stride into ker, v and u | |||
| pmovsxwd m1, [kerq + xq * 8] | |||
| pmovsxwd m2, [vq + xq * 8] | |||
| pmovsxwd m3, [uq + xq * 8] | |||
| ; byte offset of each tap = v * in_linesize + u | |||
| pmulld m4, m2, m0 | |||
| paddd m4, m3 | |||
| mova m3, m7 | |||
| vpgatherdd m2, [srcq + m4], m3 | |||
| pand m2, m6 | |||
| ; weight each tap, then sum the four products of every lane | |||
| pmulld m2, m1 | |||
| phaddd m2, m2 | |||
| phaddd m1, m2, m2 | |||
| ; drop the 14 fractional bits of the weights | |||
| psrld m1, m1, 0xe | |||
| vextracti128 xm2, m1, 1 | |||
| ; NOTE(review): the low byte is stored without saturation, unlike the C | |||
| ; version's av_clip_uint8; presumably fine because bilinear weights are | |||
| ; non-negative and sum to at most 16384 -- confirm. | |||
| pextrb [dstq+xq], xm1, 0 | |||
| pextrb [dstq+xq+1], xm2, 0 | |||
| ; NOTE(review): no tail handling; an odd width makes the last iteration read | |||
| ; one extra pixel's taps and write one byte past width -- confirm padding. | |||
| add xq, mmsize / 16 | |||
| cmp xq, widthq | |||
| jl .loop | |||
| RET | |||
| ; Row remap with a 4x4 window for 8-bit samples: one output pixel per | |||
| ; iteration, its 16 taps handled as two groups of 8. | |||
| INIT_YMM avx2 | |||
| cglobal remap4_8bit_line, 7, 9, 11, dst, width, src, in_linesize, u, v, ker, x, y | |||
| movsxdifnidn widthq, widthd | |||
| ; yq = byte offset into ker/v/u, xq = output pixel index | |||
| xor yq, yq | |||
| xor xq, xq | |||
| movd xm0, in_linesized | |||
| ; m7 = all ones; copied into the gather mask before each vpgatherdd | |||
| pcmpeqw m7, m7 | |||
| vpbroadcastd m0, xm0 | |||
| ; m6 = 255 in every dword, used to keep only the gathered byte | |||
| vpbroadcastd m6, [pd_255] | |||
| .loop: | |||
| ; taps 0-7 at offset yq, taps 8-15 at offset yq + 16 | |||
| pmovsxwd m1, [kerq + yq] | |||
| pmovsxwd m5, [kerq + yq + 16] | |||
| pmovsxwd m2, [vq + yq] | |||
| pmovsxwd m8, [vq + yq + 16] | |||
| pmovsxwd m3, [uq + yq] | |||
| pmovsxwd m9, [uq + yq + 16] | |||
| ; byte offset of each tap = v * in_linesize + u | |||
| pmulld m4, m2, m0 | |||
| pmulld m10, m8, m0 | |||
| paddd m4, m3 | |||
| paddd m10, m9 | |||
| mova m3, m7 | |||
| vpgatherdd m2, [srcq + m4], m3 | |||
| mova m3, m7 | |||
| vpgatherdd m4, [srcq + m10], m3 | |||
| pand m2, m6 | |||
| pand m4, m6 | |||
| ; weight the taps and fold both groups together | |||
| pmulld m2, m1 | |||
| pmulld m4, m5 | |||
| paddd m2, m4 | |||
| ; add the high lane onto the low lane, then reduce horizontally | |||
| vextracti128 xm1, m2, 1 | |||
| paddd m1, m2 | |||
| phaddd m1, m1 | |||
| phaddd m1, m1 | |||
| ; NOTE(review): psrld is a logical shift, whereas the C reference shifts a | |||
| ; signed int before av_clip_uint8; confirm that negative sums still clamp as | |||
| ; intended once packuswb saturates the low word. | |||
| psrld m1, m1, 0xe | |||
| packuswb m1, m1 | |||
| pextrb [dstq+xq], xm1, 0 | |||
| ; 16 taps * 2 bytes = 32 bytes of ker/v/u consumed per output pixel | |||
| add xq, 1 | |||
| add yq, 32 | |||
| cmp xq, widthq | |||
| jl .loop | |||
| RET | |||
| %endif | |||
| @@ -0,0 +1,50 @@ | |||
| /* | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| #include "config.h" | |||
| #include "libavutil/attributes.h" | |||
| #include "libavutil/cpu.h" | |||
| #include "libavutil/x86/cpu.h" | |||
| #include "libavfilter/v360.h" | |||
| void ff_remap1_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize, | |||
| const uint16_t *u, const uint16_t *v, const int16_t *ker); | |||
| void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize, | |||
| const uint16_t *u, const uint16_t *v, const int16_t *ker); | |||
| void ff_remap4_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize, | |||
| const uint16_t *u, const uint16_t *v, const int16_t *ker); | |||
| /** | |||
| * Replace the C row-remapping function with an AVX2 version where available. | |||
| * | |||
| * Nothing is changed unless the build targets x86-64, the CPU flags satisfy | |||
| * EXTERNAL_AVX2_FAST() and the content is at most 8 bits deep. | |||
| * | |||
| * @param s filter context; s->interp is read, s->remap_line may be overwritten | |||
| * @param depth bit depth of the processed samples | |||
| */ | |||
| av_cold void ff_v360_init_x86(V360Context *s, int depth) | |||
| { | |||
| #if ARCH_X86_64 | |||
|     const int cpu_flags = av_get_cpu_flags(); | |||
|     /* Keep the AVX2 check as the enclosing condition so that the references | |||
|      * to the assembly symbols are dropped when AVX2 is not built. */ | |||
|     if (EXTERNAL_AVX2_FAST(cpu_flags) && depth <= 8) { | |||
|         switch (s->interp) { | |||
|         case NEAREST: | |||
|             s->remap_line = ff_remap1_8bit_line_avx2; | |||
|             break; | |||
|         case BILINEAR: | |||
|             s->remap_line = ff_remap2_8bit_line_avx2; | |||
|             break; | |||
|         case BICUBIC: | |||
|         case LANCZOS: | |||
|             s->remap_line = ff_remap4_8bit_line_avx2; | |||
|             break; | |||
|         default: | |||
|             break; | |||
|         } | |||
|     } | |||
| #endif | |||
| } | |||