The reason for this change is that a single table is easier to handle in PIC code (in particular on Darwin). The old names are kept as pointers (declared static in cabac_functions.h, so that gcc knows they are just immediate offsets into the table), which lets the C code stay the same; alternatively, the offsets could be used directly in the functions that need the tables. This should produce the same code as before for non-PIC builds and better code (confirmed) for PIC builds. The assembly uses the new table, but it still will not work in the PIC case.
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
tags/n0.11
@@ -31,6 +31,29 @@ | |||||
#include "cabac.h" | #include "cabac.h" | ||||
#include "cabac_functions.h" | #include "cabac_functions.h" | ||||
uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63] = { | |||||
9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5, | |||||
4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, | |||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, | |||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, | |||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
}; | |||||
static const uint8_t lps_range[64][4]= { | static const uint8_t lps_range[64][4]= { | ||||
{128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205}, | {128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205}, | ||||
{116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166}, | {116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166}, | ||||
@@ -50,8 +73,6 @@ static const uint8_t lps_range[64][4]= { | |||||
{ 6, 8, 9, 11}, { 6, 7, 9, 10}, { 6, 7, 8, 9}, { 2, 2, 2, 2}, | { 6, 8, 9, 11}, { 6, 7, 9, 10}, { 6, 7, 8, 9}, { 2, 2, 2, 2}, | ||||
}; | }; | ||||
uint8_t ff_h264_mlps_state[4*64]; | |||||
uint8_t ff_h264_lps_range[4*2*64]; | |||||
static uint8_t h264_lps_state[2*64]; | static uint8_t h264_lps_state[2*64]; | ||||
static uint8_t h264_mps_state[2*64]; | static uint8_t h264_mps_state[2*64]; | ||||
@@ -77,27 +98,11 @@ static const uint8_t lps_state[64]= { | |||||
36,36,37,37,37,38,38,63, | 36,36,37,37,37,38,38,63, | ||||
}; | }; | ||||
const uint8_t ff_h264_norm_shift[512]= { | |||||
9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5, | |||||
4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, | |||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, | |||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, | |||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |||||
static const uint8_t last_coeff_flag_offset_8x8[63] = { | |||||
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |||||
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, | |||||
5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 | |||||
}; | }; | ||||
/** | /** | ||||
@@ -158,6 +163,9 @@ void ff_init_cabac_states(CABACContext *c){ | |||||
ff_h264_mlps_state[128-2*i-2]= 0; | ff_h264_mlps_state[128-2*i-2]= 0; | ||||
} | } | ||||
} | } | ||||
for(i=0; i< 63; i++){ | |||||
ff_h264_last_coeff_flag_offset_8x8[i] = last_coeff_flag_offset_8x8[i]; | |||||
} | |||||
} | } | ||||
#ifdef TEST | #ifdef TEST | ||||
@@ -31,6 +31,11 @@ | |||||
#include "put_bits.h" | #include "put_bits.h" | ||||
#define H264_NORM_SHIFT_OFFSET 0 | |||||
#define H264_LPS_RANGE_OFFSET 512 | |||||
#define H264_MLPS_STATE_OFFSET 1024 | |||||
#define H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET 1280 | |||||
#define CABAC_BITS 16 | #define CABAC_BITS 16 | ||||
#define CABAC_MASK ((1<<CABAC_BITS)-1) | #define CABAC_MASK ((1<<CABAC_BITS)-1) | ||||
@@ -36,9 +36,11 @@ | |||||
# include "x86/cabac.h" | # include "x86/cabac.h" | ||||
#endif | #endif | ||||
extern const uint8_t ff_h264_norm_shift[512]; | |||||
extern uint8_t ff_h264_mlps_state[4*64]; | |||||
extern uint8_t ff_h264_lps_range[4*2*64]; ///< rangeTabLPS | |||||
extern uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; | |||||
static uint8_t * const ff_h264_norm_shift = ff_h264_cabac_tables + H264_NORM_SHIFT_OFFSET; | |||||
static uint8_t * const ff_h264_lps_range = ff_h264_cabac_tables + H264_LPS_RANGE_OFFSET; | |||||
static uint8_t * const ff_h264_mlps_state = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET; | |||||
static uint8_t * const ff_h264_last_coeff_flag_offset_8x8 = ff_h264_cabac_tables + H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET; | |||||
static void refill(CABACContext *c){ | static void refill(CABACContext *c){ | ||||
#if CABAC_BITS == 16 | #if CABAC_BITS == 16 | ||||
@@ -1561,13 +1561,6 @@ static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, | |||||
return base_ctx[cat] + ctx; | return base_ctx[cat] + ctx; | ||||
} | } | ||||
DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = { | |||||
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |||||
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, | |||||
5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 | |||||
}; | |||||
static av_always_inline void | static av_always_inline void | ||||
decode_cabac_residual_internal(H264Context *h, DCTELEM *block, | decode_cabac_residual_internal(H264Context *h, DCTELEM *block, | ||||
int cat, int n, const uint8_t *scantable, | int cat, int n, const uint8_t *scantable, | ||||
@@ -1670,7 +1663,7 @@ decode_cabac_residual_internal(H264Context *h, DCTELEM *block, | |||||
last_coeff_ctx_base-significant_coeff_ctx_base); | last_coeff_ctx_base-significant_coeff_ctx_base); | ||||
} | } | ||||
#else | #else | ||||
DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] ); | |||||
DECODE_SIGNIFICANCE( 63, sig_off[last], ff_h264_last_coeff_flag_offset_8x8[last] ); | |||||
} else { | } else { | ||||
if (is_dc && chroma422) { // dc 422 | if (is_dc && chroma422) { // dc 422 | ||||
DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]); | DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]); | ||||
@@ -24,6 +24,7 @@ | |||||
#include "libavcodec/cabac.h" | #include "libavcodec/cabac.h" | ||||
#include "libavutil/attributes.h" | #include "libavutil/attributes.h" | ||||
#include "libavutil/x86_cpu.h" | #include "libavutil/x86_cpu.h" | ||||
#include "libavutil/internal.h" | |||||
#include "config.h" | #include "config.h" | ||||
#if HAVE_FAST_CMOV | #if HAVE_FAST_CMOV | ||||
@@ -51,16 +52,16 @@ | |||||
"xor "tmp" , "ret" \n\t" | "xor "tmp" , "ret" \n\t" | ||||
#endif /* HAVE_FAST_CMOV */ | #endif /* HAVE_FAST_CMOV */ | ||||
#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end) \ | |||||
#define BRANCHLESS_GET_CABAC(ret, statep, low, lowword, range, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off) \ | |||||
"movzbl "statep" , "ret" \n\t"\ | "movzbl "statep" , "ret" \n\t"\ | ||||
"mov "range" , "tmp" \n\t"\ | "mov "range" , "tmp" \n\t"\ | ||||
"and $0xC0 , "range" \n\t"\ | "and $0xC0 , "range" \n\t"\ | ||||
"movzbl "MANGLE(ff_h264_lps_range)"("ret", "range", 2), "range" \n\t"\ | |||||
"movzbl "MANGLE(ff_h264_cabac_tables)"+"lps_off"("ret", "range", 2), "range" \n\t"\ | |||||
"sub "range" , "tmp" \n\t"\ | "sub "range" , "tmp" \n\t"\ | ||||
BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp) \ | BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp) \ | ||||
"movzbl " MANGLE(ff_h264_norm_shift) "("range"), %%ecx \n\t"\ | |||||
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"("range"), %%ecx \n\t"\ | |||||
"shl %%cl , "range" \n\t"\ | "shl %%cl , "range" \n\t"\ | ||||
"movzbl "MANGLE(ff_h264_mlps_state)"+128("ret"), "tmp" \n\t"\ | |||||
"movzbl "MANGLE(ff_h264_cabac_tables)"+"mlps_off"+128("ret"), "tmp" \n\t"\ | |||||
"shl %%cl , "low" \n\t"\ | "shl %%cl , "low" \n\t"\ | ||||
"mov "tmpbyte" , "statep" \n\t"\ | "mov "tmpbyte" , "statep" \n\t"\ | ||||
"test "lowword" , "lowword" \n\t"\ | "test "lowword" , "lowword" \n\t"\ | ||||
@@ -73,7 +74,7 @@ | |||||
"shr $15 , %%ecx \n\t"\ | "shr $15 , %%ecx \n\t"\ | ||||
"bswap "tmp" \n\t"\ | "bswap "tmp" \n\t"\ | ||||
"shr $15 , "tmp" \n\t"\ | "shr $15 , "tmp" \n\t"\ | ||||
"movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t"\ | |||||
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"(%%ecx), %%ecx \n\t"\ | |||||
"sub $0xFFFF , "tmp" \n\t"\ | "sub $0xFFFF , "tmp" \n\t"\ | ||||
"neg %%ecx \n\t"\ | "neg %%ecx \n\t"\ | ||||
"add $7 , %%ecx \n\t"\ | "add $7 , %%ecx \n\t"\ | ||||
@@ -93,11 +94,14 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c, | |||||
__asm__ volatile( | __asm__ volatile( | ||||
BRANCHLESS_GET_CABAC("%0", "(%4)", "%1", "%w1", | BRANCHLESS_GET_CABAC("%0", "(%4)", "%1", "%w1", | ||||
"%2", "%3", "%b3", | "%2", "%3", "%b3", | ||||
"%a6(%5)", "%a7(%5)") | |||||
"%a6(%5)", "%a7(%5)", "%a8", "%a9", "%a10") | |||||
: "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp) | : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp) | ||||
: "r"(state), "r"(c), | : "r"(state), "r"(c), | ||||
"i"(offsetof(CABACContext, bytestream)), | "i"(offsetof(CABACContext, bytestream)), | ||||
"i"(offsetof(CABACContext, bytestream_end)) | |||||
"i"(offsetof(CABACContext, bytestream_end)), | |||||
"i"(H264_NORM_SHIFT_OFFSET), | |||||
"i"(H264_LPS_RANGE_OFFSET), | |||||
"i"(H264_MLPS_STATE_OFFSET) | |||||
: "%"REG_c, "memory" | : "%"REG_c, "memory" | ||||
); | ); | ||||
return bit & 1; | return bit & 1; | ||||
@@ -45,12 +45,13 @@ static int decode_significance_x86(CABACContext *c, int max_coeff, | |||||
int minusindex= 4-(intptr_t)index; | int minusindex= 4-(intptr_t)index; | ||||
int bit; | int bit; | ||||
x86_reg coeff_count; | x86_reg coeff_count; | ||||
__asm__ volatile( | __asm__ volatile( | ||||
"3: \n\t" | "3: \n\t" | ||||
BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3", | BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3", | ||||
"%5", "%k0", "%b0", | "%5", "%k0", "%b0", | ||||
"%a11(%6)", "%a12(%6)") | |||||
"%a11(%6)", "%a12(%6)", "%a13", "%a14", "%a15") | |||||
"test $1, %4 \n\t" | "test $1, %4 \n\t" | ||||
" jz 4f \n\t" | " jz 4f \n\t" | ||||
@@ -58,7 +59,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff, | |||||
BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3", | BRANCHLESS_GET_CABAC("%4", "(%1)", "%3", "%w3", | ||||
"%5", "%k0", "%b0", | "%5", "%k0", "%b0", | ||||
"%a11(%6)", "%a12(%6)") | |||||
"%a11(%6)", "%a12(%6)", "%a13", "%a14", "%a15") | |||||
"sub %10, %1 \n\t" | "sub %10, %1 \n\t" | ||||
"mov %2, %0 \n\t" | "mov %2, %0 \n\t" | ||||
@@ -86,7 +87,10 @@ static int decode_significance_x86(CABACContext *c, int max_coeff, | |||||
"+&r"(c->low), "=&r"(bit), "+&r"(c->range) | "+&r"(c->low), "=&r"(bit), "+&r"(c->range) | ||||
: "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off), | : "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off), | ||||
"i"(offsetof(CABACContext, bytestream)), | "i"(offsetof(CABACContext, bytestream)), | ||||
"i"(offsetof(CABACContext, bytestream_end)) | |||||
"i"(offsetof(CABACContext, bytestream_end)), | |||||
"i"(H264_NORM_SHIFT_OFFSET), | |||||
"i"(H264_LPS_RANGE_OFFSET), | |||||
"i"(H264_MLPS_STATE_OFFSET) | |||||
: "%"REG_c, "memory" | : "%"REG_c, "memory" | ||||
); | ); | ||||
return coeff_count; | return coeff_count; | ||||
@@ -100,6 +104,7 @@ static int decode_significance_8x8_x86(CABACContext *c, | |||||
x86_reg coeff_count; | x86_reg coeff_count; | ||||
x86_reg last=0; | x86_reg last=0; | ||||
x86_reg state; | x86_reg state; | ||||
__asm__ volatile( | __asm__ volatile( | ||||
"mov %1, %6 \n\t" | "mov %1, %6 \n\t" | ||||
"3: \n\t" | "3: \n\t" | ||||
@@ -110,18 +115,19 @@ static int decode_significance_8x8_x86(CABACContext *c, | |||||
BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3", | BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3", | ||||
"%5", "%k0", "%b0", | "%5", "%k0", "%b0", | ||||
"%a12(%7)", "%a13(%7)") | |||||
"%a12(%7)", "%a13(%7)", "%a14", "%a15", "%a16") | |||||
"mov %1, %k6 \n\t" | "mov %1, %k6 \n\t" | ||||
"test $1, %4 \n\t" | "test $1, %4 \n\t" | ||||
" jz 4f \n\t" | " jz 4f \n\t" | ||||
"movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%k6), %k6\n\t" | |||||
"movzbl "MANGLE(ff_h264_cabac_tables)"+%a17(%k6), %k6\n\t" | |||||
"add %11, %6 \n\t" | "add %11, %6 \n\t" | ||||
BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3", | BRANCHLESS_GET_CABAC("%4", "(%6)", "%3", "%w3", | ||||
"%5", "%k0", "%b0", | "%5", "%k0", "%b0", | ||||
"%a12(%7)", "%a13(%7)") | |||||
"%a12(%7)", "%a13(%7)", "%a14", "%a15", "%a16") | |||||
"mov %2, %0 \n\t" | "mov %2, %0 \n\t" | ||||
"mov %1, %k6 \n\t" | "mov %1, %k6 \n\t" | ||||
@@ -147,7 +153,11 @@ static int decode_significance_8x8_x86(CABACContext *c, | |||||
: "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), | : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), | ||||
"m"(sig_off), "m"(last_coeff_ctx_base), | "m"(sig_off), "m"(last_coeff_ctx_base), | ||||
"i"(offsetof(CABACContext, bytestream)), | "i"(offsetof(CABACContext, bytestream)), | ||||
"i"(offsetof(CABACContext, bytestream_end)) | |||||
"i"(offsetof(CABACContext, bytestream_end)), | |||||
"i"(H264_NORM_SHIFT_OFFSET), | |||||
"i"(H264_LPS_RANGE_OFFSET), | |||||
"i"(H264_MLPS_STATE_OFFSET), | |||||
"i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) | |||||
: "%"REG_c, "memory" | : "%"REG_c, "memory" | ||||
); | ); | ||||
return coeff_count; | return coeff_count; | ||||