asm code by Henrik Gramner
tags/n4.0
| @@ -74,10 +74,25 @@ static void sub_median_pred_c(uint8_t *dst, const uint8_t *src1, | |||||
| *left_top = lt; | *left_top = lt; | ||||
| } | } | ||||
/**
 * Left prediction over a plane (C reference implementation).
 *
 * For every sample, visited row by row, stores the difference between that
 * sample and the previously visited one (modulo 256). The predictor starts at
 * 0x80 and is carried across row boundaries; it is not reset per row.
 *
 * @param dst    output, written contiguously (width bytes per row, no stride)
 * @param src    first sample of the input plane; only read
 * @param stride byte distance between the starts of consecutive input rows
 * @param width  number of samples per row
 * @param height number of rows
 */
static void sub_left_predict_c(uint8_t *dst, uint8_t *src,
                               ptrdiff_t stride, ptrdiff_t width, int height)
{
    ptrdiff_t i; /* same type as width, so the index cannot overflow before the bound */
    int j;
    uint8_t prev = 0x80; /* Set the initial value */

    for (j = 0; j < height; j++) {
        for (i = 0; i < width; i++) {
            *dst++ = src[i] - prev;
            prev   = src[i];
        }
        src += stride;
    }
}
| av_cold void ff_llvidencdsp_init(LLVidEncDSPContext *c) | av_cold void ff_llvidencdsp_init(LLVidEncDSPContext *c) | ||||
| { | { | ||||
| c->diff_bytes = diff_bytes_c; | c->diff_bytes = diff_bytes_c; | ||||
| c->sub_median_pred = sub_median_pred_c; | c->sub_median_pred = sub_median_pred_c; | ||||
| c->sub_left_predict = sub_left_predict_c; | |||||
| if (ARCH_X86) | if (ARCH_X86) | ||||
| ff_llvidencdsp_init_x86(c); | ff_llvidencdsp_init_x86(c); | ||||
| @@ -21,6 +21,8 @@ | |||||
| #include <stdint.h> | #include <stdint.h> | ||||
| #include "avcodec.h" | |||||
| typedef struct LLVidEncDSPContext { | typedef struct LLVidEncDSPContext { | ||||
| void (*diff_bytes)(uint8_t *dst /* align 16 */, | void (*diff_bytes)(uint8_t *dst /* align 16 */, | ||||
| const uint8_t *src1 /* align 16 */, | const uint8_t *src1 /* align 16 */, | ||||
| @@ -33,6 +35,9 @@ typedef struct LLVidEncDSPContext { | |||||
| void (*sub_median_pred)(uint8_t *dst, const uint8_t *src1, | void (*sub_median_pred)(uint8_t *dst, const uint8_t *src1, | ||||
| const uint8_t *src2, intptr_t w, | const uint8_t *src2, intptr_t w, | ||||
| int *left, int *left_top); | int *left, int *left_top); | ||||
| void (*sub_left_predict)(uint8_t *dst, uint8_t *src, | |||||
| ptrdiff_t stride, ptrdiff_t width, int height); | |||||
| } LLVidEncDSPContext; | } LLVidEncDSPContext; | ||||
| void ff_llvidencdsp_init(LLVidEncDSPContext *c); | void ff_llvidencdsp_init(LLVidEncDSPContext *c); | ||||
| @@ -283,23 +283,6 @@ static void mangle_rgb_planes(uint8_t *dst[4], ptrdiff_t dst_stride, | |||||
| } | } | ||||
| } | } | ||||
/*
 * Left-predict a plane: each output byte is the current source sample minus
 * the sample visited just before it. The predictor starts at 0x80 and keeps
 * its value from one row to the next. Output is packed (width bytes per row).
 */
static void left_predict(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                         int width, int height)
{
    uint8_t last = 0x80; /* predictor for the very first sample */
    ptrdiff_t out = 0;   /* running write position in dst */
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *line = src + row * stride;

        for (col = 0; col < width; col++) {
            const uint8_t cur = line[col];

            dst[out++] = cur - last;
            last       = cur;
        }
    }
}
| #undef A | #undef A | ||||
| #undef B | #undef B | ||||
| @@ -436,8 +419,7 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src, | |||||
| for (i = 0; i < c->slices; i++) { | for (i = 0; i < c->slices; i++) { | ||||
| sstart = send; | sstart = send; | ||||
| send = height * (i + 1) / c->slices & cmask; | send = height * (i + 1) / c->slices & cmask; | ||||
| left_predict(src + sstart * stride, dst + sstart * width, | |||||
| stride, width, send - sstart); | |||||
| c->llvidencdsp.sub_left_predict(dst + sstart * width, src + sstart * stride, stride, width, send - sstart); | |||||
| } | } | ||||
| break; | break; | ||||
| case PRED_MEDIAN: | case PRED_MEDIAN: | ||||
| @@ -25,6 +25,8 @@ | |||||
| %include "libavutil/x86/x86util.asm" | %include "libavutil/x86/x86util.asm" | ||||
| cextern pb_80 | |||||
| SECTION .text | SECTION .text | ||||
| ; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, | ; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, | ||||
| @@ -149,3 +151,44 @@ DIFF_BYTES_PROLOGUE | |||||
| DIFF_BYTES_BODY u, u | DIFF_BYTES_BODY u, u | ||||
| %undef i | %undef i | ||||
| %endif | %endif | ||||
| ;-------------------------------------------------------------------------------------------------- | |||||
| ;void sub_left_predict(uint8_t *dst, uint8_t *src, ptrdiff_t stride, ptrdiff_t width, int height) | |||||
| ;-------------------------------------------------------------------------------------------------- | |||||
; Left prediction over a plane: each output byte is the source byte minus the
; source byte that precedes it in scan order. The "previous byte" is carried
; from the end of one row into the start of the next.
;
; src advances by stride per row; dst advances by width per row (packed output).
; Every inner step loads and stores 2 * 16 bytes, so when width is not a
; multiple of 32 the last step of a row touches bytes past width in both src
; and dst. NOTE(review): callers presumably provide padded buffers - confirm.
; The row body runs before height is tested, so at least one row is processed.
| INIT_XMM avx | |||||
| cglobal sub_left_predict, 5,6,5, dst, src, stride, width, height, x | |||||
; m1 supplies the predecessor byte in its top lane (byte 15).
; NOTE(review): pb_80 is an external constant, presumably 16 bytes of 0x80,
; matching the 0x80 initial value of the C version - confirm.
| mova m1, [pb_80] ; prev initial | |||||
; Point dst/src at the end of the row; widthq is negated below so that it
; serves as a negative offset counting up toward zero.
| add dstq, widthq | |||||
| add srcq, widthq | |||||
| lea xd, [widthq-1] | |||||
| neg widthq | |||||
; Build the pshufb control in m4: byte 15 = (width - 1) & 15, i.e. the lane of
; the row's last byte inside its 16-byte vector; the other bytes are copied
; from m1 (with 0x80 there, pshufb writes zero to those lanes).
| and xd, 15 | |||||
| pinsrb m4, m1, xd, 15 | |||||
; x = -width: used to rewind widthq and to advance dst after each row.
| mov xq, widthq | |||||
| .loop: | |||||
; m2 = m0 shifted up by one byte with m1's top byte shifted in, i.e. the
; predecessor of every byte in m0; same for m3 relative to m1/m0.
| movu m0, [srcq + widthq] | |||||
| palignr m2, m0, m1, 15 | |||||
| movu m1, [srcq + widthq + 16] | |||||
| palignr m3, m1, m0, 15 | |||||
; current - predecessor, byte-wise with wraparound
| psubb m2, m0, m2 | |||||
| psubb m3, m1, m3 | |||||
| movu [dstq + widthq], m2 | |||||
| movu [dstq + widthq + 16], m3 | |||||
| add widthq, 2 * 16 | |||||
| jl .loop | |||||
; End of row: step src to the next row and dst past the width bytes just written.
| add srcq, strideq | |||||
| sub dstq, xq ; dst + width | |||||
; Bit 4 of -width tells which half of the final 32-byte step held the row's
; last byte: set -> it was in m0 (first half), so move it into m1;
; clear -> it is already in m1 (second half).
| test xd, 16 | |||||
| jz .mod32 | |||||
| mova m1, m0 | |||||
| .mod32: | |||||
; Move the row's last byte into lane 15 of m1 so it becomes the predecessor
; of the first byte of the next row.
| pshufb m1, m4 | |||||
| mov widthq, xq | |||||
| dec heightd | |||||
| jg .loop | |||||
| RET | |||||
| @@ -36,6 +36,9 @@ void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, | |||||
| void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, | void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, | ||||
| intptr_t w); | intptr_t w); | ||||
/* AVX version of sub_left_predict; installed in LLVidEncDSPContext when
 * EXTERNAL_AVX(cpu_flags) holds. Same signature as the C fallback. */
| void ff_sub_left_predict_avx(uint8_t *dst, uint8_t *src, | |||||
| ptrdiff_t stride, ptrdiff_t width, int height); | |||||
| #if HAVE_INLINE_ASM | #if HAVE_INLINE_ASM | ||||
| static void sub_median_pred_mmxext(uint8_t *dst, const uint8_t *src1, | static void sub_median_pred_mmxext(uint8_t *dst, const uint8_t *src1, | ||||
| @@ -98,6 +101,10 @@ av_cold void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c) | |||||
| c->diff_bytes = ff_diff_bytes_sse2; | c->diff_bytes = ff_diff_bytes_sse2; | ||||
| } | } | ||||
| if (EXTERNAL_AVX(cpu_flags)) { | |||||
| c->sub_left_predict = ff_sub_left_predict_avx; | |||||
| } | |||||
| if (EXTERNAL_AVX2_FAST(cpu_flags)) { | if (EXTERNAL_AVX2_FAST(cpu_flags)) { | ||||
| c->diff_bytes = ff_diff_bytes_avx2; | c->diff_bytes = ff_diff_bytes_avx2; | ||||
| } | } | ||||