swscale/x86/output: add AVX2 version of yuv2nv12cX

256 bits is just wide enough to fit all the operands needed to vectorize the software implementation, but AVX2 is needed for a couple of instructions like cross-lane permutation. Output is bit-for-bit identical to C. Signed-off-by: Nelson Gomez <nelson.gomez@microsoft.com>
5 years ago · bc01337db4
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -2,6 +2,7 @@
 ;* x86-optimized vertical line scaling functions
 ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
 ;*                    Kieran Kunhya <kieran@kunhya.com>
 ;*           (c) 2020 Nelson Gomez <nelson.gomez@microsoft.com>
 ;*
 ;* This file is part of FFmpeg.
 ;*
@@ -22,7 +23,7 @@

 %include "libavutil/x86/x86util.asm"

 SECTION_RODATA
 SECTION_RODATA 32

 minshort:      times 8 dw 0x8000
 yuv2yuvX_16_start:  times 4 dd 0x4000 - 0x40000000
@@ -34,9 +35,20 @@ pd_4:          times 4 dd 4
 pd_4min0x40000:times 4 dd 4 - (0x40000)
 pw_16:         times 8 dw 16
 pw_32:         times 8 dw 32
 pd_255:        times 8 dd 255
 pw_512:        times 8 dw 512
 pw_1024:       times 8 dw 1024

 ; pshufb control for the NV12 writer, repeated for both 128-bit lanes.
 ; Within each lane it gathers the low byte of every dword (source bytes
 ; 0, 4, 8, 12) into the first four bytes; the -1 entries have their high
 ; bit set, so pshufb writes zero to the remaining twelve bytes.
 yuv2nv12_shuffle_mask: times 2 db 0,  4,  8, 12, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1
 ; Same gather for the NV21 writer, but with each pair of bytes swapped
 ; (source bytes 4, 0, 12, 8) so the two interleaved components come out in
 ; the opposite order.
 yuv2nv21_shuffle_mask: times 2 db 4,  0, 12,  8, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1
 ; vpermd dword indices: result dword 1 is taken from source dword 4 (the
 ; first dword of the upper lane), which places the two 4-byte groups produced
 ; by the shuffle above side by side in the low 8 bytes of the register.
 ; NOTE(review): this table is read with a full-width mova, so it relies on
 ; the preceding constants keeping it 32-byte aligned - confirm when adding
 ; or reordering entries in this section.
 yuv2nv12_permute_mask: dd 0, 4, 1, 2, 3, 5, 6, 7

 SECTION .text

 ;-----------------------------------------------------------------------------
@@ -423,3 +435,115 @@ yuv2plane1_fn  9, 5, 3
 yuv2plane1_fn 10, 5, 3
 yuv2plane1_fn 16, 5, 3
 %endif

 %undef movsx

 ;-----------------------------------------------------------------------------
 ; AVX2 yuv2nv12cX implementation
 ;
 ; void ff_yuv2nv12cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
 ;                         const int16_t *filter, int filterSize,
 ;                         const int16_t **u, const int16_t **v,
 ;                         uint8_t *dst, int dstWidth)
 ;
 ; void ff_yuv2nv21cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
 ;                         const int16_t *filter, int filterSize,
 ;                         const int16_t **u, const int16_t **v,
 ;                         uint8_t *dst, int dstWidth)
 ;-----------------------------------------------------------------------------

 %if ARCH_X86_64
 ;-----------------------------------------------------------------------------
 ; yuv2nv12cX_fn <prefix>
 ;
 ; Emits <prefix>cX for the currently selected instruction set. <prefix> also
 ; selects the byte shuffle table (<prefix>_shuffle_mask), which decides
 ; whether each output pair is written as U,V or as V,U.
 ;
 ; Arguments (register assignment is handled by cglobal for the target ABI):
 ;   format     - not read; its register is reused as scratch (tmp1)
 ;   dither     - 8 dither bytes; its register becomes tmp2 once they are read
 ;   filter     - filterSize signed 16-bit coefficients
 ;   filterSize - number of filter taps / source rows
 ;   u, v       - arrays of filterSize pointers to int16 source rows
 ;   dst        - interleaved 8-bit output
 ;   dstWidth   - number of chroma sample pairs to produce
 ;
 ; For sample index i the result is
 ;   U = clip_uint8(((dither[i & 7]       << 12) + sum_j u[j][i] * filter[j]) >> 19)
 ;   V = clip_uint8(((dither[(i + 3) & 7] << 12) + sum_j v[j][i] * filter[j]) >> 19)
 ;
 ; Notes:
 ;   - Every outer iteration consumes 8 samples from each source row and
 ;     stores 16 output bytes, so dst needs room for dstWidth rounded up to a
 ;     multiple of 8 sample pairs.
 ;   - Source rows are read with aligned 16-byte loads.
 ;   - Both loops test their condition at the bottom, so each body runs at
 ;     least once even when filterSize or dstWidth is not positive.
 ;-----------------------------------------------------------------------------
 %macro yuv2nv12cX_fn 1
 cglobal %1cX, 8, 11, 13, tmp1, dither, filter, filterSize, u, v, dst, dstWidth

    ; Build the dither bias in the same interleaved order as the accumulators
    ; (u0 v0 u1 v1 ...): U lanes take dither[i & 7], V lanes take
    ; dither[(i + 3) & 7]. Rotating the 8 dither bytes right by 3 bytes yields
    ; the V sequence; interleaving it bytewise with the U sequence gives all
    ; 16 per-lane biases.
    mov tmp1q, qword [ditherq]
    movq xm0, tmp1q                         ; d0 d1 d2 d3 d4 d5 d6 d7 (U)
    ror tmp1q, 24
    movq xm1, tmp1q                         ; d3 d4 d5 d6 d7 d0 d1 d2 (V)
    punpcklbw xm0, xm0, xm1                 ; d0 d3 d1 d4 d2 d5 d3 d6 | d4 d7 d5 d0 d6 d1 d7 d2
    psrldq xm1, xm0, 8                      ; upper 8 bias bytes -> samples 4..7

    pmovzxbd m0, xm0
    pslld m0, m0, 12                        ; ditherLo (samples 0..3)
    pmovzxbd m1, xm1
    pslld m1, m1, 12                        ; ditherHi (samples 4..7)

    pxor m9, m9                             ; uint8_min dwords
    mova m10, [pd_255]                      ; uint8_max dwords
    mova m11, [%1_shuffle_mask]             ; shuffle_mask
    mova m12, [yuv2nv12_permute_mask]       ; permute mask

    ; dither has been consumed: rename its register to tmp2 and give the three
    ; extra scratch registers names.
    ;   pos  - index of the first sample of the current group of 8
    ;   tap  - current filter tap / source row
    ;   coef - sign-extended filter coefficient for the current tap
    DEFINE_ARGS tmp1, tmp2, filter, filterSize, u, v, dst, dstWidth, pos, tap, coef

    xor posd, posd

 .outer:
    mova m2, m0                             ; resultLo
    mova m3, m1                             ; resultHi
    xor tapd, tapd

 .inner:
    movsx coefd, word [filterq + (2 * tapq)]
    movd xm4, coefd
    vpbroadcastd m4, xm4                    ; filter[tap] in every dword

    mov tmp1q, [uq + (gprsize * tapq)]
    mova xm7, oword [tmp1q + 2 * posq]      ; 8 U samples of this row

    mov tmp2q, [vq + (gprsize * tapq)]
    mova xm8, oword [tmp2q + 2 * posq]      ; 8 V samples of this row

    punpcklwd xm5, xm7, xm8
    pmovsxwd m5, xm5                        ; multiplicandsLo: u0 v0 u1 v1 u2 v2 u3 v3
    punpckhwd xm6, xm7, xm8
    pmovsxwd m6, xm6                        ; multiplicandsHi: u4 v4 u5 v5 u6 v6 u7 v7

    pmulld m7, m5, m4                       ; mulResultLo
    pmulld m8, m6, m4                       ; mulResultHi
    paddd m2, m2, m7                        ; resultLo += mulResultLo
    paddd m3, m3, m8                        ; resultHi += mulResultHi

    inc tapd
    cmp tapd, filterSized
    jl .inner
    ; end of inner loop

    psrad m2, m2, 19
    psrad m3, m3, 19

    ; Vectorized av_clip_uint8
    pmaxsd m2, m2, m9
    pmaxsd m3, m3, m9
    pminsd m2, m2, m10
    pminsd m3, m3, m10

    ; At this point every dword holds one clamped byte value:
    ;     m2: u0 0 0 0  v0 0 0 0  u1 0 0 0  v1 0 0 0 | u2 ... v3 0 0 0
    ;     m3: u4 0 0 0  v4 0 0 0  u5 0 0 0  v5 0 0 0 | u6 ... v7 0 0 0
    ;
    ; pshufb only moves bytes within a 128-bit lane, so after the shuffle the
    ; four result bytes sit at the bottom of each lane and the rest is zero
    ; (shown for the NV12 mask; the NV21 mask swaps each u/v pair):
    ;     m2: u0 v0 u1 v1 0 x12 | u2 v2 u3 v3 0 x12
    ;     m3: u4 v4 u5 v5 0 x12 | u6 v6 u7 v7 0 x12
    pshufb m2, m2, m11
    pshufb m3, m3, m11

    ; The cross-lane dword permute then pulls the upper lane's first dword
    ; next to the lower lane's first dword.
    vpermd m2, m12, m2
    vpermd m3, m12, m3

    ; Now we have the final results in the lower 8 bytes of each register
    movq [dstq], xm2
    movq [dstq + 8], xm3

    add posd, 8
    add dstq, 16

    cmp posd, dstWidthd
    jl .outer
    RET
 %endmacro

 INIT_YMM avx2
 yuv2nv12cX_fn yuv2nv12
 yuv2nv12cX_fn yuv2nv21
 %endif ; ARCH_X86_64
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -380,6 +380,17 @@ INPUT_FUNCS(sse2);
 INPUT_FUNCS(ssse3);
 INPUT_FUNCS(avx);

 #if ARCH_X86_64
 /* Declares the prototype of one assembly interleaved-chroma vertical scaler,
  * named ff_yuv2<fmt>cX_<opt>. The definitions live in output.asm. */
 #define YUV2NV_DECL(fmt, opt)                                  \
 void ff_yuv2 ## fmt ## cX_ ## opt(enum AVPixelFormat format,   \
                                   const uint8_t *dither,       \
                                   const int16_t *filter,       \
                                   int filterSize,              \
                                   const int16_t **u,           \
                                   const int16_t **v,           \
                                   uint8_t *dst,                \
                                   int dstWidth)

 YUV2NV_DECL(nv12, avx2);
 YUV2NV_DECL(nv21, avx2);
 #endif

 av_cold void ff_sws_init_swscale_x86(SwsContext *c)
 {
    int cpu_flags = av_get_cpu_flags();
@@ -580,4 +591,21 @@ switch(c->dstBpc){ \
            break;
        }
    }

 #if ARCH_X86_64
    /* Install the AVX2 interleaved-chroma writers when the CPU flags pass the
     * EXTERNAL_AVX2_FAST check. Only these four destination formats are
     * handled; any other format keeps whatever yuv2nv12cX was set to before
     * this point. */
    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
        switch (c->dstFormat) {
        /* NV12 and NV24 share one kernel. */
        case AV_PIX_FMT_NV12:
        case AV_PIX_FMT_NV24:
            c->yuv2nv12cX = ff_yuv2nv12cX_avx2;
            break;
        /* NV21 and NV42 share the other kernel, which writes each chroma
         * pair in the opposite byte order. */
        case AV_PIX_FMT_NV21:
        case AV_PIX_FMT_NV42:
            c->yuv2nv12cX = ff_yuv2nv21cX_avx2;
            break;
        default:
            break;
        }
    }
 #endif
 }