The non-intra-pcm branch in hl_decode_mb (simple, 8bpp) goes from 700 to 672 cycles, and the complete loop of decode_mb_cabac and hl_decode_mb (in the decode_slice loop) goes from 1759 to 1733 cycles on the clip tested (cathedral), i.e. almost 30 cycles per macroblock faster.
Signed-off-by: Martin Storsjö <martin@martin.st>
tags/n2.0
| @@ -22,9 +22,12 @@ | |||||
| function ff_h264_idct_add_neon, export=1 | function ff_h264_idct_add_neon, export=1 | ||||
| vld1.64 {d0-d3}, [r1,:128] | vld1.64 {d0-d3}, [r1,:128] | ||||
| vmov.i16 q15, #0 | |||||
| vswp d1, d2 | vswp d1, d2 | ||||
| vst1.16 {q15}, [r1,:128]! | |||||
| vadd.i16 d4, d0, d1 | vadd.i16 d4, d0, d1 | ||||
| vst1.16 {q15}, [r1,:128]! | |||||
| vshr.s16 q8, q1, #1 | vshr.s16 q8, q1, #1 | ||||
| vsub.i16 d5, d0, d1 | vsub.i16 d5, d0, d1 | ||||
| vadd.i16 d6, d2, d17 | vadd.i16 d6, d2, d17 | ||||
| @@ -65,11 +68,14 @@ function ff_h264_idct_add_neon, export=1 | |||||
| vst1.32 {d0[1]}, [r0,:32], r2 | vst1.32 {d0[1]}, [r0,:32], r2 | ||||
| vst1.32 {d1[0]}, [r0,:32], r2 | vst1.32 {d1[0]}, [r0,:32], r2 | ||||
| sub r1, r1, #32 | |||||
| bx lr | bx lr | ||||
| endfunc | endfunc | ||||
| function ff_h264_idct_dc_add_neon, export=1 | function ff_h264_idct_dc_add_neon, export=1 | ||||
| mov r3, #0 | |||||
| vld1.16 {d2[],d3[]}, [r1,:16] | vld1.16 {d2[],d3[]}, [r1,:16] | ||||
| strh r3, [r1] | |||||
| vrshr.s16 q1, q1, #6 | vrshr.s16 q1, q1, #6 | ||||
| vld1.32 {d0[0]}, [r0,:32], r2 | vld1.32 {d0[0]}, [r0,:32], r2 | ||||
| vld1.32 {d0[1]}, [r0,:32], r2 | vld1.32 {d0[1]}, [r0,:32], r2 | ||||
| @@ -148,7 +154,7 @@ function ff_h264_idct_add8_neon, export=1 | |||||
| add r5, r1, #16*4 | add r5, r1, #16*4 | ||||
| add r1, r2, #16*32 | add r1, r2, #16*32 | ||||
| mov r2, r3 | mov r2, r3 | ||||
| mov r3, r1 | |||||
| mov r10, r1 | |||||
| ldr r6, [sp, #32] | ldr r6, [sp, #32] | ||||
| movrel r7, scan8+16 | movrel r7, scan8+16 | ||||
| mov r12, #0 | mov r12, #0 | ||||
| @@ -156,7 +162,7 @@ function ff_h264_idct_add8_neon, export=1 | |||||
| ldr r0, [r5, r12, lsl #2] | ldr r0, [r5, r12, lsl #2] | ||||
| ldrb r8, [r6, r8] | ldrb r8, [r6, r8] | ||||
| add r0, r0, r4 | add r0, r0, r4 | ||||
| add r1, r3, r12, lsl #5 | |||||
| add r1, r10, r12, lsl #5 | |||||
| cmp r8, #0 | cmp r8, #0 | ||||
| ldrsh r8, [r1] | ldrsh r8, [r1] | ||||
| iteet ne | iteet ne | ||||
| @@ -180,7 +186,9 @@ endfunc | |||||
| qb .req q14 | qb .req q14 | ||||
| vshr.s16 q2, q10, #1 | vshr.s16 q2, q10, #1 | ||||
| vadd.i16 q0, q8, q12 | vadd.i16 q0, q8, q12 | ||||
| vld1.16 {q14-q15},[r1,:128]! | |||||
| vld1.16 {q14-q15},[r1,:128] | |||||
| vst1.16 {q3}, [r1,:128]! | |||||
| vst1.16 {q3}, [r1,:128]! | |||||
| vsub.i16 q1, q8, q12 | vsub.i16 q1, q8, q12 | ||||
| vshr.s16 q3, q14, #1 | vshr.s16 q3, q14, #1 | ||||
| vsub.i16 q2, q2, q14 | vsub.i16 q2, q2, q14 | ||||
| @@ -259,9 +267,16 @@ endfunc | |||||
| .endm | .endm | ||||
| function ff_h264_idct8_add_neon, export=1 | function ff_h264_idct8_add_neon, export=1 | ||||
| vld1.16 {q8-q9}, [r1,:128]! | |||||
| vld1.16 {q10-q11},[r1,:128]! | |||||
| vld1.16 {q12-q13},[r1,:128]! | |||||
| vmov.i16 q3, #0 | |||||
| vld1.16 {q8-q9}, [r1,:128] | |||||
| vst1.16 {q3}, [r1,:128]! | |||||
| vst1.16 {q3}, [r1,:128]! | |||||
| vld1.16 {q10-q11},[r1,:128] | |||||
| vst1.16 {q3}, [r1,:128]! | |||||
| vst1.16 {q3}, [r1,:128]! | |||||
| vld1.16 {q12-q13},[r1,:128] | |||||
| vst1.16 {q3}, [r1,:128]! | |||||
| vst1.16 {q3}, [r1,:128]! | |||||
| idct8x8_cols 0 | idct8x8_cols 0 | ||||
| idct8x8_cols 1 | idct8x8_cols 1 | ||||
| @@ -313,7 +328,9 @@ function ff_h264_idct8_add_neon, export=1 | |||||
| endfunc | endfunc | ||||
| function ff_h264_idct8_dc_add_neon, export=1 | function ff_h264_idct8_dc_add_neon, export=1 | ||||
| mov r3, #0 | |||||
| vld1.16 {d30[],d31[]},[r1,:16] | vld1.16 {d30[],d31[]},[r1,:16] | ||||
| strh r3, [r1] | |||||
| vld1.32 {d0}, [r0,:64], r2 | vld1.32 {d0}, [r0,:64], r2 | ||||
| vrshr.s16 q15, q15, #6 | vrshr.s16 q15, q15, #6 | ||||
| vld1.32 {d1}, [r0,:64], r2 | vld1.32 {d1}, [r0,:64], r2 | ||||
| @@ -2257,7 +2257,7 @@ static av_always_inline void hl_decode_mb_predict_luma(H264Context *h, | |||||
| if (IS_8x8DCT(mb_type)) { | if (IS_8x8DCT(mb_type)) { | ||||
| if (transform_bypass) { | if (transform_bypass) { | ||||
| idct_dc_add = | idct_dc_add = | ||||
| idct_add = h->h264dsp.h264_add_pixels8; | |||||
| idct_add = h->h264dsp.h264_add_pixels8_clear; | |||||
| } else { | } else { | ||||
| idct_dc_add = h->h264dsp.h264_idct8_dc_add; | idct_dc_add = h->h264dsp.h264_idct8_dc_add; | ||||
| idct_add = h->h264dsp.h264_idct8_add; | idct_add = h->h264dsp.h264_idct8_add; | ||||
| @@ -2282,7 +2282,7 @@ static av_always_inline void hl_decode_mb_predict_luma(H264Context *h, | |||||
| } else { | } else { | ||||
| if (transform_bypass) { | if (transform_bypass) { | ||||
| idct_dc_add = | idct_dc_add = | ||||
| idct_add = h->h264dsp.h264_add_pixels4; | |||||
| idct_add = h->h264dsp.h264_add_pixels4_clear; | |||||
| } else { | } else { | ||||
| idct_dc_add = h->h264dsp.h264_idct_dc_add; | idct_dc_add = h->h264dsp.h264_idct_dc_add; | ||||
| idct_add = h->h264dsp.h264_idct_add; | idct_add = h->h264dsp.h264_idct_add; | ||||
| @@ -2379,9 +2379,9 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type, | |||||
| for (i = 0; i < 16; i++) | for (i = 0; i < 16; i++) | ||||
| if (h->non_zero_count_cache[scan8[i + p * 16]] || | if (h->non_zero_count_cache[scan8[i + p * 16]] || | ||||
| dctcoef_get(h->mb, pixel_shift, i * 16 + p * 256)) | dctcoef_get(h->mb, pixel_shift, i * 16 + p * 256)) | ||||
| h->h264dsp.h264_add_pixels4(dest_y + block_offset[i], | |||||
| h->mb + (i * 16 + p * 256 << pixel_shift), | |||||
| linesize); | |||||
| h->h264dsp.h264_add_pixels4_clear(dest_y + block_offset[i], | |||||
| h->mb + (i * 16 + p * 256 << pixel_shift), | |||||
| linesize); | |||||
| } | } | ||||
| } else { | } else { | ||||
| h->h264dsp.h264_idct_add16intra(dest_y, block_offset, | h->h264dsp.h264_idct_add16intra(dest_y, block_offset, | ||||
| @@ -2392,8 +2392,8 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type, | |||||
| } else if (h->cbp & 15) { | } else if (h->cbp & 15) { | ||||
| if (transform_bypass) { | if (transform_bypass) { | ||||
| const int di = IS_8x8DCT(mb_type) ? 4 : 1; | const int di = IS_8x8DCT(mb_type) ? 4 : 1; | ||||
| idct_add = IS_8x8DCT(mb_type) ? h->h264dsp.h264_add_pixels8 | |||||
| : h->h264dsp.h264_add_pixels4; | |||||
| idct_add = IS_8x8DCT(mb_type) ? h->h264dsp.h264_add_pixels8_clear | |||||
| : h->h264dsp.h264_add_pixels4_clear; | |||||
| for (i = 0; i < 16; i += di) | for (i = 0; i < 16; i += di) | ||||
| if (h->non_zero_count_cache[scan8[i + p * 16]]) | if (h->non_zero_count_cache[scan8[i + p * 16]]) | ||||
| idct_add(dest_y + block_offset[i], | idct_add(dest_y + block_offset[i], | ||||
| @@ -207,7 +207,7 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h) | |||||
| h->mb + (16 * 16 * 2 << PIXEL_SHIFT), | h->mb + (16 * 16 * 2 << PIXEL_SHIFT), | ||||
| uvlinesize); | uvlinesize); | ||||
| } else { | } else { | ||||
| idct_add = h->h264dsp.h264_add_pixels4; | |||||
| idct_add = h->h264dsp.h264_add_pixels4_clear; | |||||
| for (j = 1; j < 3; j++) { | for (j = 1; j < 3; j++) { | ||||
| for (i = j * 16; i < j * 16 + 4; i++) | for (i = j * 16; i < j * 16 + 4; i++) | ||||
| if (h->non_zero_count_cache[scan8[i]] || | if (h->non_zero_count_cache[scan8[i]] || | ||||
| @@ -261,10 +261,6 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h) | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| if (h->cbp || IS_INTRA(mb_type)) { | |||||
| h->dsp.clear_blocks(h->mb); | |||||
| h->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT)); | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| @@ -368,11 +364,6 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h) | |||||
| hl_decode_mb_idct_luma(h, mb_type, 1, SIMPLE, transform_bypass, | hl_decode_mb_idct_luma(h, mb_type, 1, SIMPLE, transform_bypass, | ||||
| PIXEL_SHIFT, block_offset, linesize, | PIXEL_SHIFT, block_offset, linesize, | ||||
| dest[p], p); | dest[p], p); | ||||
| if (h->cbp || IS_INTRA(mb_type)) { | |||||
| h->dsp.clear_blocks(h->mb); | |||||
| h->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT)); | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| @@ -43,6 +43,8 @@ static void FUNCC(ff_h264_add_pixels4)(uint8_t *_dst, int16_t *_src, int stride) | |||||
| dst += stride; | dst += stride; | ||||
| src += 4; | src += 4; | ||||
| } | } | ||||
| memset(_src, 0, sizeof(dctcoef) * 16); | |||||
| } | } | ||||
| static void FUNCC(ff_h264_add_pixels8)(uint8_t *_dst, int16_t *_src, int stride) | static void FUNCC(ff_h264_add_pixels8)(uint8_t *_dst, int16_t *_src, int stride) | ||||
| @@ -65,4 +67,6 @@ static void FUNCC(ff_h264_add_pixels8)(uint8_t *_dst, int16_t *_src, int stride) | |||||
| dst += stride; | dst += stride; | ||||
| src += 8; | src += 8; | ||||
| } | } | ||||
| memset(_src, 0, sizeof(dctcoef) * 64); | |||||
| } | } | ||||
| @@ -57,8 +57,8 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo | |||||
| #define FUNC(a, depth) a ## _ ## depth ## _c | #define FUNC(a, depth) a ## _ ## depth ## _c | ||||
| #define ADDPX_DSP(depth) \ | #define ADDPX_DSP(depth) \ | ||||
| c->h264_add_pixels4 = FUNC(ff_h264_add_pixels4, depth);\ | |||||
| c->h264_add_pixels8 = FUNC(ff_h264_add_pixels8, depth) | |||||
| c->h264_add_pixels4_clear = FUNC(ff_h264_add_pixels4, depth);\ | |||||
| c->h264_add_pixels8_clear = FUNC(ff_h264_add_pixels8, depth) | |||||
| if (bit_depth > 8 && bit_depth <= 16) { | if (bit_depth > 8 && bit_depth <= 16) { | ||||
| ADDPX_DSP(16); | ADDPX_DSP(16); | ||||
| @@ -103,8 +103,8 @@ typedef struct H264DSPContext { | |||||
| void (*h264_chroma_dc_dequant_idct)(int16_t *block, int qmul); | void (*h264_chroma_dc_dequant_idct)(int16_t *block, int qmul); | ||||
| /* bypass-transform */ | /* bypass-transform */ | ||||
| void (*h264_add_pixels8)(uint8_t *dst, int16_t *block, int stride); | |||||
| void (*h264_add_pixels4)(uint8_t *dst, int16_t *block, int stride); | |||||
| void (*h264_add_pixels8_clear)(uint8_t *dst, int16_t *block, int stride); | |||||
| void (*h264_add_pixels4_clear)(uint8_t *dst, int16_t *block, int stride); | |||||
| } H264DSPContext; | } H264DSPContext; | ||||
| void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, | void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, | ||||
| @@ -61,6 +61,8 @@ void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride) | |||||
| dst[i + 2*stride]= av_clip_pixel(dst[i + 2*stride] + ((z1 - z2) >> 6)); | dst[i + 2*stride]= av_clip_pixel(dst[i + 2*stride] + ((z1 - z2) >> 6)); | ||||
| dst[i + 3*stride]= av_clip_pixel(dst[i + 3*stride] + ((z0 - z3) >> 6)); | dst[i + 3*stride]= av_clip_pixel(dst[i + 3*stride] + ((z0 - z3) >> 6)); | ||||
| } | } | ||||
| memset(block, 0, 16 * sizeof(dctcoef)); | |||||
| } | } | ||||
| void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){ | void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){ | ||||
| @@ -133,14 +135,18 @@ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){ | |||||
| dst[i + 6*stride] = av_clip_pixel( dst[i + 6*stride] + ((b2 - b5) >> 6) ); | dst[i + 6*stride] = av_clip_pixel( dst[i + 6*stride] + ((b2 - b5) >> 6) ); | ||||
| dst[i + 7*stride] = av_clip_pixel( dst[i + 7*stride] + ((b0 - b7) >> 6) ); | dst[i + 7*stride] = av_clip_pixel( dst[i + 7*stride] + ((b0 - b7) >> 6) ); | ||||
| } | } | ||||
| memset(block, 0, 64 * sizeof(dctcoef)); | |||||
| } | } | ||||
| // assumes all AC coefs are 0 | // assumes all AC coefs are 0 | ||||
| void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *block, int stride){ | |||||
| void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *_block, int stride){ | |||||
| int i, j; | int i, j; | ||||
| int dc = (((dctcoef*)block)[0] + 32) >> 6; | |||||
| pixel *dst = (pixel*)_dst; | pixel *dst = (pixel*)_dst; | ||||
| dctcoef *block = (dctcoef*)_block; | |||||
| int dc = (block[0] + 32) >> 6; | |||||
| stride /= sizeof(pixel); | stride /= sizeof(pixel); | ||||
| block[0] = 0; | |||||
| for( j = 0; j < 4; j++ ) | for( j = 0; j < 4; j++ ) | ||||
| { | { | ||||
| for( i = 0; i < 4; i++ ) | for( i = 0; i < 4; i++ ) | ||||
| @@ -149,10 +155,12 @@ void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *block, int stride){ | |||||
| } | } | ||||
| } | } | ||||
| void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *block, int stride){ | |||||
| void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *_block, int stride){ | |||||
| int i, j; | int i, j; | ||||
| int dc = (((dctcoef*)block)[0] + 32) >> 6; | |||||
| pixel *dst = (pixel*)_dst; | pixel *dst = (pixel*)_dst; | ||||
| dctcoef *block = (dctcoef*)_block; | |||||
| int dc = (block[0] + 32) >> 6; | |||||
| block[0] = 0; | |||||
| stride /= sizeof(pixel); | stride /= sizeof(pixel); | ||||
| for( j = 0; j < 8; j++ ) | for( j = 0; j < 8; j++ ) | ||||
| { | { | ||||
| @@ -98,15 +98,15 @@ typedef struct H264PredContext { | |||||
| void(*pred16x16[4 + 3 + 2])(uint8_t *src, ptrdiff_t stride); | void(*pred16x16[4 + 3 + 2])(uint8_t *src, ptrdiff_t stride); | ||||
| void(*pred4x4_add[2])(uint8_t *pix /*align 4*/, | void(*pred4x4_add[2])(uint8_t *pix /*align 4*/, | ||||
| const int16_t *block /*align 16*/, ptrdiff_t stride); | |||||
| int16_t *block /*align 16*/, ptrdiff_t stride); | |||||
| void(*pred8x8l_add[2])(uint8_t *pix /*align 8*/, | void(*pred8x8l_add[2])(uint8_t *pix /*align 8*/, | ||||
| const int16_t *block /*align 16*/, ptrdiff_t stride); | |||||
| int16_t *block /*align 16*/, ptrdiff_t stride); | |||||
| void(*pred8x8_add[3])(uint8_t *pix /*align 8*/, | void(*pred8x8_add[3])(uint8_t *pix /*align 8*/, | ||||
| const int *block_offset, | const int *block_offset, | ||||
| const int16_t *block /*align 16*/, ptrdiff_t stride); | |||||
| int16_t *block /*align 16*/, ptrdiff_t stride); | |||||
| void(*pred16x16_add[3])(uint8_t *pix /*align 16*/, | void(*pred16x16_add[3])(uint8_t *pix /*align 16*/, | ||||
| const int *block_offset, | const int *block_offset, | ||||
| const int16_t *block /*align 16*/, ptrdiff_t stride); | |||||
| int16_t *block /*align 16*/, ptrdiff_t stride); | |||||
| } H264PredContext; | } H264PredContext; | ||||
| void ff_h264_pred_init(H264PredContext *h, int codec_id, | void ff_h264_pred_init(H264PredContext *h, int codec_id, | ||||
| @@ -1131,7 +1131,7 @@ static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft, | |||||
| #undef PL | #undef PL | ||||
| #undef SRC | #undef SRC | ||||
| static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const int16_t *_block, | |||||
| static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block, | |||||
| ptrdiff_t stride) | ptrdiff_t stride) | ||||
| { | { | ||||
| int i; | int i; | ||||
| @@ -1148,9 +1148,11 @@ static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const int16_t *_block, | |||||
| pix++; | pix++; | ||||
| block++; | block++; | ||||
| } | } | ||||
| memset(_block, 0, sizeof(dctcoef) * 16); | |||||
| } | } | ||||
| static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const int16_t *_block, | |||||
| static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block, | |||||
| ptrdiff_t stride) | ptrdiff_t stride) | ||||
| { | { | ||||
| int i; | int i; | ||||
| @@ -1166,9 +1168,11 @@ static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const int16_t *_block, | |||||
| pix+= stride; | pix+= stride; | ||||
| block+= 4; | block+= 4; | ||||
| } | } | ||||
| memset(_block, 0, sizeof(dctcoef) * 16); | |||||
| } | } | ||||
| static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const int16_t *_block, | |||||
| static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block, | |||||
| ptrdiff_t stride) | ptrdiff_t stride) | ||||
| { | { | ||||
| int i; | int i; | ||||
| @@ -1189,9 +1193,11 @@ static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const int16_t *_block, | |||||
| pix++; | pix++; | ||||
| block++; | block++; | ||||
| } | } | ||||
| memset(_block, 0, sizeof(dctcoef) * 64); | |||||
| } | } | ||||
| static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const int16_t *_block, | |||||
| static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block, | |||||
| ptrdiff_t stride) | ptrdiff_t stride) | ||||
| { | { | ||||
| int i; | int i; | ||||
| @@ -1211,10 +1217,12 @@ static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const int16_t *_block, | |||||
| pix+= stride; | pix+= stride; | ||||
| block+= 8; | block+= 8; | ||||
| } | } | ||||
| memset(_block, 0, sizeof(dctcoef) * 64); | |||||
| } | } | ||||
| static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, | static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, | ||||
| const int16_t *block, | |||||
| int16_t *block, | |||||
| ptrdiff_t stride) | ptrdiff_t stride) | ||||
| { | { | ||||
| int i; | int i; | ||||
| @@ -1224,7 +1232,7 @@ static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, | |||||
| static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, | static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, | ||||
| const int *block_offset, | const int *block_offset, | ||||
| const int16_t *block, | |||||
| int16_t *block, | |||||
| ptrdiff_t stride) | ptrdiff_t stride) | ||||
| { | { | ||||
| int i; | int i; | ||||
| @@ -1233,7 +1241,7 @@ static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, | |||||
| } | } | ||||
| static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, | static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, | ||||
| const int16_t *block, ptrdiff_t stride) | |||||
| int16_t *block, ptrdiff_t stride) | |||||
| { | { | ||||
| int i; | int i; | ||||
| for(i=0; i<4; i++) | for(i=0; i<4; i++) | ||||
| @@ -1241,7 +1249,7 @@ static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, | |||||
| } | } | ||||
| static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, | static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, | ||||
| const int16_t *block, ptrdiff_t stride) | |||||
| int16_t *block, ptrdiff_t stride) | |||||
| { | { | ||||
| int i; | int i; | ||||
| for(i=0; i<4; i++) | for(i=0; i<4; i++) | ||||
| @@ -1251,7 +1259,7 @@ static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, | |||||
| } | } | ||||
| static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, | static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, | ||||
| const int16_t *block, | |||||
| int16_t *block, | |||||
| ptrdiff_t stride) | ptrdiff_t stride) | ||||
| { | { | ||||
| int i; | int i; | ||||
| @@ -1261,7 +1269,7 @@ static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, | |||||
| static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix, | static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix, | ||||
| const int *block_offset, | const int *block_offset, | ||||
| const int16_t *block, ptrdiff_t stride) | |||||
| int16_t *block, ptrdiff_t stride) | |||||
| { | { | ||||
| int i; | int i; | ||||
| for(i=0; i<4; i++) | for(i=0; i<4; i++) | ||||
| @@ -87,6 +87,7 @@ static void ff_h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride) | |||||
| vtmp1 = vec_sld(vtmp0, vtmp0, 8); | vtmp1 = vec_sld(vtmp0, vtmp0, 8); | ||||
| vtmp2 = vec_ld(16,block); | vtmp2 = vec_ld(16,block); | ||||
| vtmp3 = vec_sld(vtmp2, vtmp2, 8); | vtmp3 = vec_sld(vtmp2, vtmp2, 8); | ||||
| memset(block, 0, 16 * sizeof(int16_t)); | |||||
| VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); | VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); | ||||
| VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3); | VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3); | ||||
| @@ -206,6 +207,7 @@ static void ff_h264_idct8_add_altivec( uint8_t *dst, int16_t *dct, int stride ) | |||||
| s5 = vec_ld(0x50, (int16_t*)dct); | s5 = vec_ld(0x50, (int16_t*)dct); | ||||
| s6 = vec_ld(0x60, (int16_t*)dct); | s6 = vec_ld(0x60, (int16_t*)dct); | ||||
| s7 = vec_ld(0x70, (int16_t*)dct); | s7 = vec_ld(0x70, (int16_t*)dct); | ||||
| memset(dct, 0, 64 * sizeof(int16_t)); | |||||
| IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, | IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, | ||||
| d0, d1, d2, d3, d4, d5, d6, d7); | d0, d1, d2, d3, d4, d5, d6, d7); | ||||
| @@ -234,6 +236,7 @@ static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *bl | |||||
| int i; | int i; | ||||
| dc = (block[0] + 32) >> 6; | dc = (block[0] + 32) >> 6; | ||||
| block[0] = 0; | |||||
| dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1); | dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1); | ||||
| if (size == 4) | if (size == 4) | ||||
| @@ -216,6 +216,8 @@ void ff_svq3_add_idct_c(uint8_t *dst, int16_t *block, | |||||
| dst[i + stride * 2] = av_clip_uint8(dst[i + stride * 2] + ((z1 - z2) * qmul + rr >> 20)); | dst[i + stride * 2] = av_clip_uint8(dst[i + stride * 2] + ((z1 - z2) * qmul + rr >> 20)); | ||||
| dst[i + stride * 3] = av_clip_uint8(dst[i + stride * 3] + ((z0 - z3) * qmul + rr >> 20)); | dst[i + stride * 3] = av_clip_uint8(dst[i + stride * 3] + ((z0 - z3) * qmul + rr >> 20)); | ||||
| } | } | ||||
| memset(block, 0, 16 * sizeof(int16_t)); | |||||
| } | } | ||||
| static inline int svq3_decode_block(GetBitContext *gb, int16_t *block, | static inline int svq3_decode_block(GetBitContext *gb, int16_t *block, | ||||
| @@ -664,8 +666,6 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type) | |||||
| } | } | ||||
| if (!IS_SKIP(mb_type) || h->pict_type == AV_PICTURE_TYPE_B) { | if (!IS_SKIP(mb_type) || h->pict_type == AV_PICTURE_TYPE_B) { | ||||
| memset(h->non_zero_count_cache + 8, 0, 14 * 8 * sizeof(uint8_t)); | memset(h->non_zero_count_cache + 8, 0, 14 * 8 * sizeof(uint8_t)); | ||||
| h->dsp.clear_blocks(h->mb + 0); | |||||
| h->dsp.clear_blocks(h->mb + 384); | |||||
| } | } | ||||
| if (!IS_INTRA16x16(mb_type) && | if (!IS_INTRA16x16(mb_type) && | ||||
| @@ -70,6 +70,10 @@ SECTION .text | |||||
| paddw m0, m6 | paddw m0, m6 | ||||
| IDCT4_1D w, 0, 1, 2, 3, 4, 5 | IDCT4_1D w, 0, 1, 2, 3, 4, 5 | ||||
| pxor m7, m7 | pxor m7, m7 | ||||
| movq [%2+ 0], m7 | |||||
| movq [%2+ 8], m7 | |||||
| movq [%2+16], m7 | |||||
| movq [%2+24], m7 | |||||
| STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3 | STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3 | ||||
| lea %1, [%1+%3*2] | lea %1, [%1+%3*2] | ||||
| @@ -161,13 +165,31 @@ cglobal h264_idct_add_8, 3, 3, 0 | |||||
| %endmacro | %endmacro | ||||
| ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride | ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride | ||||
| %macro IDCT8_ADD_MMX_END 3 | |||||
| %macro IDCT8_ADD_MMX_END 3-4 | |||||
| IDCT8_1D_FULL %2 | IDCT8_1D_FULL %2 | ||||
| mova [%2 ], m5 | mova [%2 ], m5 | ||||
| mova [%2+16], m6 | mova [%2+16], m6 | ||||
| mova [%2+32], m7 | mova [%2+32], m7 | ||||
| pxor m7, m7 | pxor m7, m7 | ||||
| %if %0 == 4 | |||||
| movq [%4+ 0], m7 | |||||
| movq [%4+ 8], m7 | |||||
| movq [%4+ 16], m7 | |||||
| movq [%4+ 24], m7 | |||||
| movq [%4+ 32], m7 | |||||
| movq [%4+ 40], m7 | |||||
| movq [%4+ 48], m7 | |||||
| movq [%4+ 56], m7 | |||||
| movq [%4+ 64], m7 | |||||
| movq [%4+ 72], m7 | |||||
| movq [%4+ 80], m7 | |||||
| movq [%4+ 88], m7 | |||||
| movq [%4+ 96], m7 | |||||
| movq [%4+104], m7 | |||||
| movq [%4+112], m7 | |||||
| movq [%4+120], m7 | |||||
| %endif | |||||
| STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3 | STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3 | ||||
| lea %1, [%1+%3*2] | lea %1, [%1+%3*2] | ||||
| STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3 | STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3 | ||||
| @@ -190,7 +212,7 @@ cglobal h264_idct8_add_8, 3, 4, 0 | |||||
| IDCT8_ADD_MMX_START r1 , rsp | IDCT8_ADD_MMX_START r1 , rsp | ||||
| IDCT8_ADD_MMX_START r1+8, rsp+64 | IDCT8_ADD_MMX_START r1+8, rsp+64 | ||||
| lea r3, [r0+4] | lea r3, [r0+4] | ||||
| IDCT8_ADD_MMX_END r0 , rsp, r2 | |||||
| IDCT8_ADD_MMX_END r0 , rsp, r2, r1 | |||||
| IDCT8_ADD_MMX_END r3 , rsp+8, r2 | IDCT8_ADD_MMX_END r3 , rsp+8, r2 | ||||
| ADD rsp, pad | ADD rsp, pad | ||||
| @@ -233,6 +255,14 @@ cglobal h264_idct8_add_8, 3, 4, 0 | |||||
| SWAP 0, 8 | SWAP 0, 8 | ||||
| SWAP 1, 9 | SWAP 1, 9 | ||||
| %endif | %endif | ||||
| mova [%2+ 0], m7 | |||||
| mova [%2+ 16], m7 | |||||
| mova [%2+ 32], m7 | |||||
| mova [%2+ 48], m7 | |||||
| mova [%2+ 64], m7 | |||||
| mova [%2+ 80], m7 | |||||
| mova [%2+ 96], m7 | |||||
| mova [%2+112], m7 | |||||
| lea %1, [%1+%3*4] | lea %1, [%1+%3*4] | ||||
| STORE_DIFF m4, m6, m7, [%1 ] | STORE_DIFF m4, m6, m7, [%1 ] | ||||
| STORE_DIFF m5, m6, m7, [%1+%3 ] | STORE_DIFF m5, m6, m7, [%1+%3 ] | ||||
| @@ -246,19 +276,11 @@ cglobal h264_idct8_add_8, 3, 4, 10 | |||||
| IDCT8_ADD_SSE r0, r1, r2, r3 | IDCT8_ADD_SSE r0, r1, r2, r3 | ||||
| RET | RET | ||||
| %macro DC_ADD_MMXEXT_INIT 2-3 | |||||
| %if %0 == 2 | |||||
| movsx %1, word [%1] | |||||
| %macro DC_ADD_MMXEXT_INIT 2 | |||||
| add %1, 32 | add %1, 32 | ||||
| sar %1, 6 | sar %1, 6 | ||||
| movd m0, %1d | movd m0, %1d | ||||
| lea %1, [%2*3] | lea %1, [%2*3] | ||||
| %else | |||||
| add %3, 32 | |||||
| sar %3, 6 | |||||
| movd m0, %3d | |||||
| lea %3, [%2*3] | |||||
| %endif | |||||
| pshufw m0, m0, 0 | pshufw m0, m0, 0 | ||||
| pxor m1, m1 | pxor m1, m1 | ||||
| psubw m1, m0 | psubw m1, m0 | ||||
| @@ -287,19 +309,44 @@ cglobal h264_idct8_add_8, 3, 4, 10 | |||||
| INIT_MMX mmxext | INIT_MMX mmxext | ||||
| ; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) | ; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) | ||||
| cglobal h264_idct_dc_add_8, 3, 3, 0 | |||||
| DC_ADD_MMXEXT_INIT r1, r2 | |||||
| DC_ADD_MMXEXT_OP movh, r0, r2, r1 | |||||
| %if ARCH_X86_64 | |||||
| cglobal h264_idct_dc_add_8, 3, 4, 0 | |||||
| movsx r3, word [r1] | |||||
| mov dword [r1], 0 | |||||
| DC_ADD_MMXEXT_INIT r3, r2 | |||||
| DC_ADD_MMXEXT_OP movh, r0, r2, r3 | |||||
| RET | RET | ||||
| ; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) | ; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) | ||||
| cglobal h264_idct8_dc_add_8, 3, 3, 0 | |||||
| DC_ADD_MMXEXT_INIT r1, r2 | |||||
| DC_ADD_MMXEXT_OP mova, r0, r2, r1 | |||||
| cglobal h264_idct8_dc_add_8, 3, 4, 0 | |||||
| movsx r3, word [r1] | |||||
| mov dword [r1], 0 | |||||
| DC_ADD_MMXEXT_INIT r3, r2 | |||||
| DC_ADD_MMXEXT_OP mova, r0, r2, r3 | |||||
| lea r0, [r0+r2*4] | lea r0, [r0+r2*4] | ||||
| DC_ADD_MMXEXT_OP mova, r0, r2, r1 | |||||
| DC_ADD_MMXEXT_OP mova, r0, r2, r3 | |||||
| RET | |||||
| %else | |||||
| cglobal h264_idct_dc_add_8, 2, 3, 0 | |||||
| movsx r2, word [r1] | |||||
| mov dword [r1], 0 | |||||
| mov r1, r2m | |||||
| DC_ADD_MMXEXT_INIT r2, r1 | |||||
| DC_ADD_MMXEXT_OP movh, r0, r1, r2 | |||||
| RET | RET | ||||
| ; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) | |||||
| cglobal h264_idct8_dc_add_8, 2, 3, 0 | |||||
| movsx r2, word [r1] | |||||
| mov dword [r1], 0 | |||||
| mov r1, r2m | |||||
| DC_ADD_MMXEXT_INIT r2, r1 | |||||
| DC_ADD_MMXEXT_OP mova, r0, r1, r2 | |||||
| lea r0, [r0+r1*4] | |||||
| DC_ADD_MMXEXT_OP mova, r0, r1, r2 | |||||
| RET | |||||
| %endif | |||||
| INIT_MMX mmx | INIT_MMX mmx | ||||
| ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, | ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, | ||||
| ; int16_t *block, int stride, const uint8_t nnzc[6*8]) | ; int16_t *block, int stride, const uint8_t nnzc[6*8]) | ||||
| @@ -343,7 +390,7 @@ cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, | |||||
| add word [r2], 32 | add word [r2], 32 | ||||
| IDCT8_ADD_MMX_START r2 , rsp | IDCT8_ADD_MMX_START r2 , rsp | ||||
| IDCT8_ADD_MMX_START r2+8, rsp+64 | IDCT8_ADD_MMX_START r2+8, rsp+64 | ||||
| IDCT8_ADD_MMX_END r6 , rsp, r3 | |||||
| IDCT8_ADD_MMX_END r6 , rsp, r3, r2 | |||||
| mov r6d, dword [r1+r5*4] | mov r6d, dword [r1+r5*4] | ||||
| lea r6, [r0+r6+4] | lea r6, [r0+r6+4] | ||||
| IDCT8_ADD_MMX_END r6 , rsp+8, r3 | IDCT8_ADD_MMX_END r6 , rsp+8, r3 | ||||
| @@ -373,7 +420,8 @@ cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride | |||||
| movsx r6, word [r2] | movsx r6, word [r2] | ||||
| test r6, r6 | test r6, r6 | ||||
| jz .no_dc | jz .no_dc | ||||
| DC_ADD_MMXEXT_INIT r2, r3, r6 | |||||
| mov word [r2], 0 | |||||
| DC_ADD_MMXEXT_INIT r6, r3 | |||||
| %if ARCH_X86_64 == 0 | %if ARCH_X86_64 == 0 | ||||
| %define dst2q r1 | %define dst2q r1 | ||||
| %define dst2d r1d | %define dst2d r1d | ||||
| @@ -450,7 +498,8 @@ cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, s | |||||
| movsx r6, word [r2] | movsx r6, word [r2] | ||||
| test r6, r6 | test r6, r6 | ||||
| jz .skipblock | jz .skipblock | ||||
| DC_ADD_MMXEXT_INIT r2, r3, r6 | |||||
| mov word [r2], 0 | |||||
| DC_ADD_MMXEXT_INIT r6, r3 | |||||
| %if ARCH_X86_64 == 0 | %if ARCH_X86_64 == 0 | ||||
| %define dst2q r1 | %define dst2q r1 | ||||
| %define dst2d r1d | %define dst2d r1d | ||||
| @@ -489,7 +538,8 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride | |||||
| movsx r6, word [r2] | movsx r6, word [r2] | ||||
| test r6, r6 | test r6, r6 | ||||
| jz .no_dc | jz .no_dc | ||||
| DC_ADD_MMXEXT_INIT r2, r3, r6 | |||||
| mov word [r2], 0 | |||||
| DC_ADD_MMXEXT_INIT r6, r3 | |||||
| %if ARCH_X86_64 == 0 | %if ARCH_X86_64 == 0 | ||||
| %define dst2q r1 | %define dst2q r1 | ||||
| %define dst2d r1d | %define dst2d r1d | ||||
| @@ -515,7 +565,7 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride | |||||
| add word [r2], 32 | add word [r2], 32 | ||||
| IDCT8_ADD_MMX_START r2 , rsp | IDCT8_ADD_MMX_START r2 , rsp | ||||
| IDCT8_ADD_MMX_START r2+8, rsp+64 | IDCT8_ADD_MMX_START r2+8, rsp+64 | ||||
| IDCT8_ADD_MMX_END r6 , rsp, r3 | |||||
| IDCT8_ADD_MMX_END r6 , rsp, r3, r2 | |||||
| mov r6d, dword [r1+r5*4] | mov r6d, dword [r1+r5*4] | ||||
| lea r6, [r0+r6+4] | lea r6, [r0+r6+4] | ||||
| IDCT8_ADD_MMX_END r6 , rsp+8, r3 | IDCT8_ADD_MMX_END r6 , rsp+8, r3 | ||||
| @@ -547,7 +597,8 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, strid | |||||
| test r6, r6 | test r6, r6 | ||||
| jz .no_dc | jz .no_dc | ||||
| INIT_MMX cpuname | INIT_MMX cpuname | ||||
| DC_ADD_MMXEXT_INIT r2, r3, r6 | |||||
| mov word [r2], 0 | |||||
| DC_ADD_MMXEXT_INIT r6, r3 | |||||
| %if ARCH_X86_64 == 0 | %if ARCH_X86_64 == 0 | ||||
| %define dst2q r1 | %define dst2q r1 | ||||
| %define dst2d r1d | %define dst2d r1d | ||||
| @@ -650,7 +701,8 @@ h264_idct_add8_mmxext_plane: | |||||
| movsx r6, word [r2] | movsx r6, word [r2] | ||||
| test r6, r6 | test r6, r6 | ||||
| jz .skipblock | jz .skipblock | ||||
| DC_ADD_MMXEXT_INIT r2, r3, r6 | |||||
| mov word [r2], 0 | |||||
| DC_ADD_MMXEXT_INIT r6, r3 | |||||
| %if ARCH_X86_64 | %if ARCH_X86_64 | ||||
| mov r0d, dword [r1+r5*4] | mov r0d, dword [r1+r5*4] | ||||
| add r0, [dst2q] | add r0, [dst2q] | ||||
| @@ -693,7 +745,9 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, | |||||
| ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered | ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered | ||||
| h264_idct_dc_add8_mmxext: | h264_idct_dc_add8_mmxext: | ||||
| movd m0, [r2 ] ; 0 0 X D | movd m0, [r2 ] ; 0 0 X D | ||||
| mov word [r2+ 0], 0 | |||||
| punpcklwd m0, [r2+32] ; x X d D | punpcklwd m0, [r2+32] ; x X d D | ||||
| mov word [r2+32], 0 | |||||
| paddsw m0, [pw_32] | paddsw m0, [pw_32] | ||||
| psraw m0, 6 | psraw m0, 6 | ||||
| punpcklwd m0, m0 ; d d D D | punpcklwd m0, m0 ; d d D D | ||||
| @@ -723,6 +777,10 @@ h264_add8x4_idct_sse2: | |||||
| paddw m0, [pw_32] | paddw m0, [pw_32] | ||||
| IDCT4_1D w,0,1,2,3,4,5 | IDCT4_1D w,0,1,2,3,4,5 | ||||
| pxor m7, m7 | pxor m7, m7 | ||||
| mova [r2+ 0], m7 | |||||
| mova [r2+16], m7 | |||||
| mova [r2+32], m7 | |||||
| mova [r2+48], m7 | |||||
| STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3 | STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3 | ||||
| lea r0, [r0+r3*2] | lea r0, [r0+r3*2] | ||||
| STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3 | STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3 | ||||
| @@ -66,6 +66,10 @@ SECTION .text | |||||
| paddd m0, [pd_32] | paddd m0, [pd_32] | ||||
| IDCT4_1D d,0,1,2,3,4,5 | IDCT4_1D d,0,1,2,3,4,5 | ||||
| pxor m5, m5 | pxor m5, m5 | ||||
| mova [%2+ 0], m5 | |||||
| mova [%2+16], m5 | |||||
| mova [%2+32], m5 | |||||
| mova [%2+48], m5 | |||||
| STORE_DIFFx2 m0, m1, m4, m5, %1, %3 | STORE_DIFFx2 m0, m1, m4, m5, %1, %3 | ||||
| lea %1, [%1+%3*2] | lea %1, [%1+%3*2] | ||||
| STORE_DIFFx2 m2, m3, m4, m5, %1, %3 | STORE_DIFFx2 m2, m3, m4, m5, %1, %3 | ||||
| @@ -98,6 +102,10 @@ add4x4_idct %+ SUFFIX: | |||||
| paddd m0, [pd_32] | paddd m0, [pd_32] | ||||
| IDCT4_1D d,0,1,2,3,4,5 | IDCT4_1D d,0,1,2,3,4,5 | ||||
| pxor m5, m5 | pxor m5, m5 | ||||
| mova [r2+ 0], m5 | |||||
| mova [r2+16], m5 | |||||
| mova [r2+32], m5 | |||||
| mova [r2+48], m5 | |||||
| STORE_DIFFx2 m0, m1, m4, m5, r5, r3 | STORE_DIFFx2 m0, m1, m4, m5, r5, r3 | ||||
| lea r5, [r5+r3*2] | lea r5, [r5+r3*2] | ||||
| STORE_DIFFx2 m2, m3, m4, m5, r5, r3 | STORE_DIFFx2 m2, m3, m4, m5, r5, r3 | ||||
| @@ -181,6 +189,7 @@ IDCT_ADD16_10 | |||||
| INIT_MMX mmxext | INIT_MMX mmxext | ||||
| cglobal h264_idct_dc_add_10,3,3 | cglobal h264_idct_dc_add_10,3,3 | ||||
| movd m0, [r1] | movd m0, [r1] | ||||
| mov dword [r1], 0 | |||||
| paddd m0, [pd_32] | paddd m0, [pd_32] | ||||
| psrad m0, 6 | psrad m0, 6 | ||||
| lea r1, [r2*3] | lea r1, [r2*3] | ||||
| @@ -193,11 +202,11 @@ cglobal h264_idct_dc_add_10,3,3 | |||||
| ; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride) | ; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride) | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| %macro IDCT8_DC_ADD 0 | %macro IDCT8_DC_ADD 0 | ||||
| cglobal h264_idct8_dc_add_10,3,3,7 | |||||
| mov r1d, [r1] | |||||
| add r1, 32 | |||||
| sar r1, 6 | |||||
| movd m0, r1d | |||||
| cglobal h264_idct8_dc_add_10,3,4,7 | |||||
| movd m0, [r1] | |||||
| mov dword[r1], 0 | |||||
| paddd m0, [pd_32] | |||||
| psrad m0, 6 | |||||
| lea r1, [r2*3] | lea r1, [r2*3] | ||||
| SPLATW m0, m0, 0 | SPLATW m0, m0, 0 | ||||
| mova m6, [pw_pixel_max] | mova m6, [pw_pixel_max] | ||||
| @@ -247,6 +256,8 @@ idct_dc_add %+ SUFFIX: | |||||
| add r5, r0 | add r5, r0 | ||||
| movq m0, [r2+ 0] | movq m0, [r2+ 0] | ||||
| movhps m0, [r2+64] | movhps m0, [r2+64] | ||||
| mov dword [r2+ 0], 0 | |||||
| mov dword [r2+64], 0 | |||||
| paddd m0, [pd_32] | paddd m0, [pd_32] | ||||
| psrad m0, 6 | psrad m0, 6 | ||||
| pshufhw m0, m0, 0 | pshufhw m0, m0, 0 | ||||
| @@ -461,6 +472,22 @@ h264_idct8_add1_10 %+ SUFFIX: | |||||
| packssdw m8, m0 | packssdw m8, m0 | ||||
| paddsw m8, [r0] | paddsw m8, [r0] | ||||
| pxor m0, m0 | pxor m0, m0 | ||||
| mova [r1+ 0], m0 | |||||
| mova [r1+ 16], m0 | |||||
| mova [r1+ 32], m0 | |||||
| mova [r1+ 48], m0 | |||||
| mova [r1+ 64], m0 | |||||
| mova [r1+ 80], m0 | |||||
| mova [r1+ 96], m0 | |||||
| mova [r1+112], m0 | |||||
| mova [r1+128], m0 | |||||
| mova [r1+144], m0 | |||||
| mova [r1+160], m0 | |||||
| mova [r1+176], m0 | |||||
| mova [r1+192], m0 | |||||
| mova [r1+208], m0 | |||||
| mova [r1+224], m0 | |||||
| mova [r1+240], m0 | |||||
| CLIPW m8, m0, [pw_pixel_max] | CLIPW m8, m0, [pw_pixel_max] | ||||
| mova [r0], m8 | mova [r0], m8 | ||||
| mova m8, [pw_pixel_max] | mova m8, [pw_pixel_max] | ||||
| @@ -480,6 +507,22 @@ h264_idct8_add1_10 %+ SUFFIX: | |||||
| lea r3, [r0+8] | lea r3, [r0+8] | ||||
| IDCT8_ADD_SSE_END r0, rsp, r2 | IDCT8_ADD_SSE_END r0, rsp, r2 | ||||
| IDCT8_ADD_SSE_END r3, rsp+16, r2 | IDCT8_ADD_SSE_END r3, rsp+16, r2 | ||||
| mova [r1+ 0], m7 | |||||
| mova [r1+ 16], m7 | |||||
| mova [r1+ 32], m7 | |||||
| mova [r1+ 48], m7 | |||||
| mova [r1+ 64], m7 | |||||
| mova [r1+ 80], m7 | |||||
| mova [r1+ 96], m7 | |||||
| mova [r1+112], m7 | |||||
| mova [r1+128], m7 | |||||
| mova [r1+144], m7 | |||||
| mova [r1+160], m7 | |||||
| mova [r1+176], m7 | |||||
| mova [r1+192], m7 | |||||
| mova [r1+208], m7 | |||||
| mova [r1+224], m7 | |||||
| mova [r1+240], m7 | |||||
| %endif ; ARCH_X86_64 | %endif ; ARCH_X86_64 | ||||
| add rsp, pad | add rsp, pad | ||||