The new function, h264_find_start_code_candidate(), performs the start code
search that was previously part of h264_find_frame_end() and is the most
CPU-intensive part of that function.
By itself, this results in a performance regression:

              Before            After
              Mean     StdDev   Mean     StdDev   Change
Overall time  2925.6   26.2     3068.5   31.7     -4.7%

However, platform-optimised implementations of the function can more than
make up for this.
Signed-off-by: Martin Storsjö <martin@martin.st>
tags/n2.1
| @@ -47,30 +47,9 @@ static int h264_find_frame_end(H264Context *h, const uint8_t *buf, | |||||
| for (i = 0; i < buf_size; i++) { | for (i = 0; i < buf_size; i++) { | ||||
| if (state == 7) { | if (state == 7) { | ||||
| #if HAVE_FAST_UNALIGNED | |||||
| /* we check i < buf_size instead of i + 3 / 7 because it is | |||||
| * simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE | |||||
| * bytes at the end. | |||||
| */ | |||||
| #if HAVE_FAST_64BIT | |||||
| while (i < buf_size && | |||||
| !((~*(const uint64_t *)(buf + i) & | |||||
| (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) & | |||||
| 0x8080808080808080ULL)) | |||||
| i += 8; | |||||
| #else | |||||
| while (i < buf_size && | |||||
| !((~*(const uint32_t *)(buf + i) & | |||||
| (*(const uint32_t *)(buf + i) - 0x01010101U)) & | |||||
| 0x80808080U)) | |||||
| i += 4; | |||||
| #endif | |||||
| #endif | |||||
| for (; i < buf_size; i++) | |||||
| if (!buf[i]) { | |||||
| state = 2; | |||||
| break; | |||||
| } | |||||
| i += h->h264dsp.h264_find_start_code_candidate(buf + i, buf_size - i); | |||||
| if (i < buf_size) | |||||
| state = 2; | |||||
| } else if (state <= 2) { | } else if (state <= 2) { | ||||
| if (buf[i] == 1) | if (buf[i] == 1) | ||||
| state ^= 5; // 2->7, 1->4, 0->5 | state ^= 5; // 2->7, 1->4, 0->5 | ||||
| @@ -53,6 +53,34 @@ | |||||
| #include "h264addpx_template.c" | #include "h264addpx_template.c" | ||||
| #undef BIT_DEPTH | #undef BIT_DEPTH | ||||
/**
 * Portable C implementation of the start code candidate search.
 *
 * Scans buf from its beginning and reports where the first zero byte
 * lies within the first size bytes.
 *
 * @param buf  bytes to scan
 * @param size number of bytes of buf to consider
 * @return index of the first zero byte, or a value >= size when none of
 *         the first size bytes is zero
 */
static int h264_find_start_code_candidate_c(const uint8_t *buf, int size)
{
    int pos = 0;

#if HAVE_FAST_UNALIGNED
    /* Skip ahead one machine word at a time while the word holds no zero
     * byte. Only pos < size is tested rather than pos + 3 / 7: that is
     * simpler, and there must be FF_INPUT_BUFFER_PADDING_SIZE bytes at
     * the end of the buffer, so the word load may run past size.
     */
#if HAVE_FAST_64BIT
    while (pos < size) {
        const uint64_t word = *(const uint64_t *)(buf + pos);

        /* The expression is non-zero iff some byte of word is zero. */
        if (~word & (word - 0x0101010101010101ULL) & 0x8080808080808080ULL)
            break;
        pos += 8;
    }
#else
    while (pos < size) {
        const uint32_t word = *(const uint32_t *)(buf + pos);

        /* The expression is non-zero iff some byte of word is zero. */
        if (~word & (word - 0x01010101U) & 0x80808080U)
            break;
        pos += 4;
    }
#endif
#endif

    /* Finish byte by byte to pin down the exact position of the zero. */
    while (pos < size && buf[pos])
        pos++;

    return pos;
}
| av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, | av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, | ||||
| const int chroma_format_idc) | const int chroma_format_idc) | ||||
| { | { | ||||
| @@ -133,6 +161,7 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, | |||||
| H264_DSP(8); | H264_DSP(8); | ||||
| break; | break; | ||||
| } | } | ||||
| c->h264_find_start_code_candidate = h264_find_start_code_candidate_c; | |||||
| if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc); | if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc); | ||||
| if (ARCH_PPC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc); | if (ARCH_PPC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc); | ||||
| @@ -105,6 +105,15 @@ typedef struct H264DSPContext { | |||||
| /* bypass-transform */ | /* bypass-transform */ | ||||
| void (*h264_add_pixels8_clear)(uint8_t *dst, int16_t *block, int stride); | void (*h264_add_pixels8_clear)(uint8_t *dst, int16_t *block, int stride); | ||||
| void (*h264_add_pixels4_clear)(uint8_t *dst, int16_t *block, int stride); | void (*h264_add_pixels4_clear)(uint8_t *dst, int16_t *block, int stride); | ||||
| /** | |||||
| * Search buf from the start for up to size bytes. Return the index | |||||
| * of a zero byte, or >= size if not found. Ideally, use lookahead | |||||
| * to filter out any zero bytes that are known to not be followed by | |||||
| * one or more further zero bytes and a one byte. Better still, filter | |||||
| * out any bytes that form the trailing_zero_8bits syntax element too. | |||||
| */ | |||||
| int (*h264_find_start_code_candidate)(const uint8_t *buf, int size); | |||||
| } H264DSPContext; | } H264DSPContext; | ||||
| void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, | void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, | ||||