This fixes out-of-bounds reads of global arrays. The alternative solutions of checking the index or modifying the VLC tables to prevent the index from going out of range are each about 1-2 CPU cycles slower per coded 4x4 block. The alternative of padding the global tables directly is uglier, and moving them into the context should benefit cache locality. Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind Signed-off-by: Michael Niedermayer <michaelni@gmx.at> tags/n1.0
| @@ -2719,19 +2719,19 @@ static void init_scan_tables(H264Context *h) | |||||
| #undef T | #undef T | ||||
| } | } | ||||
| if (h->sps.transform_bypass) { // FIXME same ugly | if (h->sps.transform_bypass) { // FIXME same ugly | ||||
| h->zigzag_scan_q0 = zigzag_scan; | |||||
| h->zigzag_scan8x8_q0 = ff_zigzag_direct; | |||||
| h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc; | |||||
| h->field_scan_q0 = field_scan; | |||||
| h->field_scan8x8_q0 = field_scan8x8; | |||||
| h->field_scan8x8_cavlc_q0 = field_scan8x8_cavlc; | |||||
| memcpy(h->zigzag_scan_q0 , zigzag_scan , sizeof(h->zigzag_scan_q0 )); | |||||
| memcpy(h->zigzag_scan8x8_q0 , ff_zigzag_direct , sizeof(h->zigzag_scan8x8_q0 )); | |||||
| memcpy(h->zigzag_scan8x8_cavlc_q0 , zigzag_scan8x8_cavlc , sizeof(h->zigzag_scan8x8_cavlc_q0)); | |||||
| memcpy(h->field_scan_q0 , field_scan , sizeof(h->field_scan_q0 )); | |||||
| memcpy(h->field_scan8x8_q0 , field_scan8x8 , sizeof(h->field_scan8x8_q0 )); | |||||
| memcpy(h->field_scan8x8_cavlc_q0 , field_scan8x8_cavlc , sizeof(h->field_scan8x8_cavlc_q0 )); | |||||
| } else { | } else { | ||||
| h->zigzag_scan_q0 = h->zigzag_scan; | |||||
| h->zigzag_scan8x8_q0 = h->zigzag_scan8x8; | |||||
| h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc; | |||||
| h->field_scan_q0 = h->field_scan; | |||||
| h->field_scan8x8_q0 = h->field_scan8x8; | |||||
| h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc; | |||||
| memcpy(h->zigzag_scan_q0 , h->zigzag_scan , sizeof(h->zigzag_scan_q0 )); | |||||
| memcpy(h->zigzag_scan8x8_q0 , h->zigzag_scan8x8 , sizeof(h->zigzag_scan8x8_q0 )); | |||||
| memcpy(h->zigzag_scan8x8_cavlc_q0 , h->zigzag_scan8x8_cavlc , sizeof(h->zigzag_scan8x8_cavlc_q0)); | |||||
| memcpy(h->field_scan_q0 , h->field_scan , sizeof(h->field_scan_q0 )); | |||||
| memcpy(h->field_scan8x8_q0 , h->field_scan8x8 , sizeof(h->field_scan8x8_q0 )); | |||||
| memcpy(h->field_scan8x8_cavlc_q0 , h->field_scan8x8_cavlc , sizeof(h->field_scan8x8_cavlc_q0 )); | |||||
| } | } | ||||
| } | } | ||||
| @@ -421,12 +421,12 @@ typedef struct H264Context { | |||||
| uint8_t field_scan[16]; | uint8_t field_scan[16]; | ||||
| uint8_t field_scan8x8[64]; | uint8_t field_scan8x8[64]; | ||||
| uint8_t field_scan8x8_cavlc[64]; | uint8_t field_scan8x8_cavlc[64]; | ||||
| const uint8_t *zigzag_scan_q0; | |||||
| const uint8_t *zigzag_scan8x8_q0; | |||||
| const uint8_t *zigzag_scan8x8_cavlc_q0; | |||||
| const uint8_t *field_scan_q0; | |||||
| const uint8_t *field_scan8x8_q0; | |||||
| const uint8_t *field_scan8x8_cavlc_q0; | |||||
| uint8_t zigzag_scan_q0[16]; | |||||
| uint8_t zigzag_scan8x8_q0[64]; | |||||
| uint8_t zigzag_scan8x8_cavlc_q0[64]; | |||||
| uint8_t field_scan_q0[16]; | |||||
| uint8_t field_scan8x8_q0[64]; | |||||
| uint8_t field_scan8x8_cavlc_q0[64]; | |||||
| int x264_build; | int x264_build; | ||||
| @@ -57,7 +57,6 @@ static const uint8_t zigzag_scan[16+1] = { | |||||
| 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4, | 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4, | ||||
| 1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, | 1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, | ||||
| 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4, | 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4, | ||||
| 0, | |||||
| }; | }; | ||||
| static const uint8_t field_scan[16+1] = { | static const uint8_t field_scan[16+1] = { | ||||
| @@ -65,7 +64,6 @@ static const uint8_t field_scan[16+1] = { | |||||
| 0 + 3 * 4, 1 + 1 * 4, 1 + 2 * 4, 1 + 3 * 4, | 0 + 3 * 4, 1 + 1 * 4, 1 + 2 * 4, 1 + 3 * 4, | ||||
| 2 + 0 * 4, 2 + 1 * 4, 2 + 2 * 4, 2 + 3 * 4, | 2 + 0 * 4, 2 + 1 * 4, 2 + 2 * 4, 2 + 3 * 4, | ||||
| 3 + 0 * 4, 3 + 1 * 4, 3 + 2 * 4, 3 + 3 * 4, | 3 + 0 * 4, 3 + 1 * 4, 3 + 2 * 4, 3 + 3 * 4, | ||||
| 0, | |||||
| }; | }; | ||||
| static const uint8_t luma_dc_zigzag_scan[16] = { | static const uint8_t luma_dc_zigzag_scan[16] = { | ||||
| @@ -112,7 +110,6 @@ static const uint8_t zigzag_scan8x8_cavlc[64+1] = { | |||||
| 1 + 4 * 8, 2 + 4 * 8, 6 + 0 * 8, 4 + 3 * 8, | 1 + 4 * 8, 2 + 4 * 8, 6 + 0 * 8, 4 + 3 * 8, | ||||
| 0 + 7 * 8, 4 + 4 * 8, 7 + 2 * 8, 3 + 6 * 8, | 0 + 7 * 8, 4 + 4 * 8, 7 + 2 * 8, 3 + 6 * 8, | ||||
| 5 + 5 * 8, 6 + 5 * 8, 6 + 6 * 8, 7 + 7 * 8, | 5 + 5 * 8, 6 + 5 * 8, 6 + 6 * 8, 7 + 7 * 8, | ||||
| 0, | |||||
| }; | }; | ||||
| static const uint8_t field_scan8x8[64+1] = { | static const uint8_t field_scan8x8[64+1] = { | ||||
| @@ -132,7 +129,6 @@ static const uint8_t field_scan8x8[64+1] = { | |||||
| 7 + 0 * 8, 7 + 1 * 8, 6 + 4 * 8, 6 + 5 * 8, | 7 + 0 * 8, 7 + 1 * 8, 6 + 4 * 8, 6 + 5 * 8, | ||||
| 6 + 6 * 8, 6 + 7 * 8, 7 + 2 * 8, 7 + 3 * 8, | 6 + 6 * 8, 6 + 7 * 8, 7 + 2 * 8, 7 + 3 * 8, | ||||
| 7 + 4 * 8, 7 + 5 * 8, 7 + 6 * 8, 7 + 7 * 8, | 7 + 4 * 8, 7 + 5 * 8, 7 + 6 * 8, 7 + 7 * 8, | ||||
| 0, | |||||
| }; | }; | ||||
| static const uint8_t field_scan8x8_cavlc[64+1] = { | static const uint8_t field_scan8x8_cavlc[64+1] = { | ||||
| @@ -152,7 +148,6 @@ static const uint8_t field_scan8x8_cavlc[64+1] = { | |||||
| 1 + 7 * 8, 3 + 2 * 8, 2 + 7 * 8, 4 + 2 * 8, | 1 + 7 * 8, 3 + 2 * 8, 2 + 7 * 8, 4 + 2 * 8, | ||||
| 3 + 7 * 8, 5 + 2 * 8, 4 + 7 * 8, 5 + 4 * 8, | 3 + 7 * 8, 5 + 2 * 8, 4 + 7 * 8, 5 + 4 * 8, | ||||
| 6 + 3 * 8, 6 + 5 * 8, 7 + 3 * 8, 7 + 7 * 8, | 6 + 3 * 8, 6 + 5 * 8, 7 + 3 * 8, 7 + 7 * 8, | ||||
| 0, | |||||
| }; | }; | ||||
| typedef struct IMbInfo { | typedef struct IMbInfo { | ||||