This fixes out-of-bounds reads of global arrays. The alternative solutions of checking the index or modifying the VLC tables to prevent the index from going outside the array are each about 1-2 CPU cycles slower per coded 4x4 block. The alternative of padding the global tables directly is uglier, and moving them into the context should benefit cache locality. Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind Signed-off-by: Michael Niedermayer <michaelni@gmx.at> tags/n1.0
@@ -2719,19 +2719,19 @@ static void init_scan_tables(H264Context *h) | |||||
#undef T | #undef T | ||||
} | } | ||||
if (h->sps.transform_bypass) { // FIXME same ugly | if (h->sps.transform_bypass) { // FIXME same ugly | ||||
h->zigzag_scan_q0 = zigzag_scan; | |||||
h->zigzag_scan8x8_q0 = ff_zigzag_direct; | |||||
h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc; | |||||
h->field_scan_q0 = field_scan; | |||||
h->field_scan8x8_q0 = field_scan8x8; | |||||
h->field_scan8x8_cavlc_q0 = field_scan8x8_cavlc; | |||||
memcpy(h->zigzag_scan_q0 , zigzag_scan , sizeof(h->zigzag_scan_q0 )); | |||||
memcpy(h->zigzag_scan8x8_q0 , ff_zigzag_direct , sizeof(h->zigzag_scan8x8_q0 )); | |||||
memcpy(h->zigzag_scan8x8_cavlc_q0 , zigzag_scan8x8_cavlc , sizeof(h->zigzag_scan8x8_cavlc_q0)); | |||||
memcpy(h->field_scan_q0 , field_scan , sizeof(h->field_scan_q0 )); | |||||
memcpy(h->field_scan8x8_q0 , field_scan8x8 , sizeof(h->field_scan8x8_q0 )); | |||||
memcpy(h->field_scan8x8_cavlc_q0 , field_scan8x8_cavlc , sizeof(h->field_scan8x8_cavlc_q0 )); | |||||
} else { | } else { | ||||
h->zigzag_scan_q0 = h->zigzag_scan; | |||||
h->zigzag_scan8x8_q0 = h->zigzag_scan8x8; | |||||
h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc; | |||||
h->field_scan_q0 = h->field_scan; | |||||
h->field_scan8x8_q0 = h->field_scan8x8; | |||||
h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc; | |||||
memcpy(h->zigzag_scan_q0 , h->zigzag_scan , sizeof(h->zigzag_scan_q0 )); | |||||
memcpy(h->zigzag_scan8x8_q0 , h->zigzag_scan8x8 , sizeof(h->zigzag_scan8x8_q0 )); | |||||
memcpy(h->zigzag_scan8x8_cavlc_q0 , h->zigzag_scan8x8_cavlc , sizeof(h->zigzag_scan8x8_cavlc_q0)); | |||||
memcpy(h->field_scan_q0 , h->field_scan , sizeof(h->field_scan_q0 )); | |||||
memcpy(h->field_scan8x8_q0 , h->field_scan8x8 , sizeof(h->field_scan8x8_q0 )); | |||||
memcpy(h->field_scan8x8_cavlc_q0 , h->field_scan8x8_cavlc , sizeof(h->field_scan8x8_cavlc_q0 )); | |||||
} | } | ||||
} | } | ||||
@@ -421,12 +421,12 @@ typedef struct H264Context { | |||||
uint8_t field_scan[16]; | uint8_t field_scan[16]; | ||||
uint8_t field_scan8x8[64]; | uint8_t field_scan8x8[64]; | ||||
uint8_t field_scan8x8_cavlc[64]; | uint8_t field_scan8x8_cavlc[64]; | ||||
const uint8_t *zigzag_scan_q0; | |||||
const uint8_t *zigzag_scan8x8_q0; | |||||
const uint8_t *zigzag_scan8x8_cavlc_q0; | |||||
const uint8_t *field_scan_q0; | |||||
const uint8_t *field_scan8x8_q0; | |||||
const uint8_t *field_scan8x8_cavlc_q0; | |||||
uint8_t zigzag_scan_q0[16]; | |||||
uint8_t zigzag_scan8x8_q0[64]; | |||||
uint8_t zigzag_scan8x8_cavlc_q0[64]; | |||||
uint8_t field_scan_q0[16]; | |||||
uint8_t field_scan8x8_q0[64]; | |||||
uint8_t field_scan8x8_cavlc_q0[64]; | |||||
int x264_build; | int x264_build; | ||||
@@ -57,7 +57,6 @@ static const uint8_t zigzag_scan[16+1] = { | |||||
1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4, | 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4, | ||||
1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, | 1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, | ||||
3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4, | 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4, | ||||
0, | |||||
}; | }; | ||||
static const uint8_t field_scan[16+1] = { | static const uint8_t field_scan[16+1] = { | ||||
@@ -65,7 +64,6 @@ static const uint8_t field_scan[16+1] = { | |||||
0 + 3 * 4, 1 + 1 * 4, 1 + 2 * 4, 1 + 3 * 4, | 0 + 3 * 4, 1 + 1 * 4, 1 + 2 * 4, 1 + 3 * 4, | ||||
2 + 0 * 4, 2 + 1 * 4, 2 + 2 * 4, 2 + 3 * 4, | 2 + 0 * 4, 2 + 1 * 4, 2 + 2 * 4, 2 + 3 * 4, | ||||
3 + 0 * 4, 3 + 1 * 4, 3 + 2 * 4, 3 + 3 * 4, | 3 + 0 * 4, 3 + 1 * 4, 3 + 2 * 4, 3 + 3 * 4, | ||||
0, | |||||
}; | }; | ||||
static const uint8_t luma_dc_zigzag_scan[16] = { | static const uint8_t luma_dc_zigzag_scan[16] = { | ||||
@@ -112,7 +110,6 @@ static const uint8_t zigzag_scan8x8_cavlc[64+1] = { | |||||
1 + 4 * 8, 2 + 4 * 8, 6 + 0 * 8, 4 + 3 * 8, | 1 + 4 * 8, 2 + 4 * 8, 6 + 0 * 8, 4 + 3 * 8, | ||||
0 + 7 * 8, 4 + 4 * 8, 7 + 2 * 8, 3 + 6 * 8, | 0 + 7 * 8, 4 + 4 * 8, 7 + 2 * 8, 3 + 6 * 8, | ||||
5 + 5 * 8, 6 + 5 * 8, 6 + 6 * 8, 7 + 7 * 8, | 5 + 5 * 8, 6 + 5 * 8, 6 + 6 * 8, 7 + 7 * 8, | ||||
0, | |||||
}; | }; | ||||
static const uint8_t field_scan8x8[64+1] = { | static const uint8_t field_scan8x8[64+1] = { | ||||
@@ -132,7 +129,6 @@ static const uint8_t field_scan8x8[64+1] = { | |||||
7 + 0 * 8, 7 + 1 * 8, 6 + 4 * 8, 6 + 5 * 8, | 7 + 0 * 8, 7 + 1 * 8, 6 + 4 * 8, 6 + 5 * 8, | ||||
6 + 6 * 8, 6 + 7 * 8, 7 + 2 * 8, 7 + 3 * 8, | 6 + 6 * 8, 6 + 7 * 8, 7 + 2 * 8, 7 + 3 * 8, | ||||
7 + 4 * 8, 7 + 5 * 8, 7 + 6 * 8, 7 + 7 * 8, | 7 + 4 * 8, 7 + 5 * 8, 7 + 6 * 8, 7 + 7 * 8, | ||||
0, | |||||
}; | }; | ||||
static const uint8_t field_scan8x8_cavlc[64+1] = { | static const uint8_t field_scan8x8_cavlc[64+1] = { | ||||
@@ -152,7 +148,6 @@ static const uint8_t field_scan8x8_cavlc[64+1] = { | |||||
1 + 7 * 8, 3 + 2 * 8, 2 + 7 * 8, 4 + 2 * 8, | 1 + 7 * 8, 3 + 2 * 8, 2 + 7 * 8, 4 + 2 * 8, | ||||
3 + 7 * 8, 5 + 2 * 8, 4 + 7 * 8, 5 + 4 * 8, | 3 + 7 * 8, 5 + 2 * 8, 4 + 7 * 8, 5 + 4 * 8, | ||||
6 + 3 * 8, 6 + 5 * 8, 7 + 3 * 8, 7 + 7 * 8, | 6 + 3 * 8, 6 + 5 * 8, 7 + 3 * 8, 7 + 7 * 8, | ||||
0, | |||||
}; | }; | ||||
typedef struct IMbInfo { | typedef struct IMbInfo { | ||||