This way, the special IDCT permutations are no longer needed. This is similar to how H264 does it, and removes the dsputil dependency imposed by the scantable code.

Also remove the unused type == 0 cases from the plain C version of the idct.

Signed-off-by: Martin Storsjö <martin@martin.st>

tags/n2.0
| @@ -41,6 +41,5 @@ av_cold void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags) | |||||
| c->idct_dc_add = ff_vp3_idct_dc_add_neon; | c->idct_dc_add = ff_vp3_idct_dc_add_neon; | ||||
| c->v_loop_filter = ff_vp3_v_loop_filter_neon; | c->v_loop_filter = ff_vp3_v_loop_filter_neon; | ||||
| c->h_loop_filter = ff_vp3_h_loop_filter_neon; | c->h_loop_filter = ff_vp3_h_loop_filter_neon; | ||||
| c->idct_perm = FF_TRANSPOSE_IDCT_PERM; | |||||
| } | } | ||||
| } | } | ||||
| @@ -61,6 +61,5 @@ av_cold void ff_vp3dsp_init_bfin(VP3DSPContext *c, int flags) | |||||
| if (!(flags & CODEC_FLAG_BITEXACT)) { | if (!(flags & CODEC_FLAG_BITEXACT)) { | ||||
| c->idct_add = bfin_vp3_idct_add; | c->idct_add = bfin_vp3_idct_add; | ||||
| c->idct_put = bfin_vp3_idct_put; | c->idct_put = bfin_vp3_idct_put; | ||||
| c->idct_perm = FF_TRANSPOSE_IDCT_PERM; | |||||
| } | } | ||||
| } | } | ||||
| @@ -184,7 +184,6 @@ av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags) | |||||
| if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) { | if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) { | ||||
| c->idct_put = vp3_idct_put_altivec; | c->idct_put = vp3_idct_put_altivec; | ||||
| c->idct_add = vp3_idct_add_altivec; | c->idct_add = vp3_idct_add_altivec; | ||||
| c->idct_perm = FF_TRANSPOSE_IDCT_PERM; | |||||
| } | } | ||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -136,6 +136,7 @@ typedef struct Vp3DecodeContext { | |||||
| ThreadFrame current_frame; | ThreadFrame current_frame; | ||||
| int keyframe; | int keyframe; | ||||
| uint8_t idct_permutation[64]; | uint8_t idct_permutation[64]; | ||||
| uint8_t idct_scantable[64]; | |||||
| DSPContext dsp; | DSPContext dsp; | ||||
| VideoDSPContext vdsp; | VideoDSPContext vdsp; | ||||
| VP3DSPContext vp3dsp; | VP3DSPContext vp3dsp; | ||||
| @@ -173,8 +174,6 @@ typedef struct Vp3DecodeContext { | |||||
| int8_t (*motion_val[2])[2]; | int8_t (*motion_val[2])[2]; | ||||
| ScanTable scantable; | |||||
| /* tables */ | /* tables */ | ||||
| uint16_t coded_dc_scale_factor[64]; | uint16_t coded_dc_scale_factor[64]; | ||||
| uint32_t coded_ac_scale_factor[64]; | uint32_t coded_ac_scale_factor[64]; | ||||
| @@ -1351,7 +1350,7 @@ static inline int vp3_dequant(Vp3DecodeContext *s, Vp3Fragment *frag, | |||||
| int plane, int inter, int16_t block[64]) | int plane, int inter, int16_t block[64]) | ||||
| { | { | ||||
| int16_t *dequantizer = s->qmat[frag->qpi][inter][plane]; | int16_t *dequantizer = s->qmat[frag->qpi][inter][plane]; | ||||
| uint8_t *perm = s->scantable.permutated; | |||||
| uint8_t *perm = s->idct_scantable; | |||||
| int i = 0; | int i = 0; | ||||
| do { | do { | ||||
| @@ -1700,8 +1699,12 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx) | |||||
| ff_videodsp_init(&s->vdsp, 8); | ff_videodsp_init(&s->vdsp, 8); | ||||
| ff_vp3dsp_init(&s->vp3dsp, avctx->flags); | ff_vp3dsp_init(&s->vp3dsp, avctx->flags); | ||||
| ff_init_scantable_permutation(s->idct_permutation, s->vp3dsp.idct_perm); | |||||
| ff_init_scantable(s->idct_permutation, &s->scantable, ff_zigzag_direct); | |||||
| for (i = 0; i < 64; i++) { | |||||
| #define T(x) (x >> 3) | ((x & 7) << 3) | |||||
| s->idct_permutation[i] = T(i); | |||||
| s->idct_scantable[i] = T(ff_zigzag_direct[i]); | |||||
| #undef T | |||||
| } | |||||
| /* initialize to an impossible value which will force a recalculation | /* initialize to an impossible value which will force a recalculation | ||||
| * in the first frame decode */ | * in the first frame decode */ | ||||
| @@ -54,11 +54,12 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int | |||||
| /* Inverse DCT on the rows now */ | /* Inverse DCT on the rows now */ | ||||
| for (i = 0; i < 8; i++) { | for (i = 0; i < 8; i++) { | ||||
| /* Check for non-zero values */ | /* Check for non-zero values */ | ||||
| if ( ip[0] | ip[1] | ip[2] | ip[3] | ip[4] | ip[5] | ip[6] | ip[7] ) { | |||||
| A = M(xC1S7, ip[1]) + M(xC7S1, ip[7]); | |||||
| B = M(xC7S1, ip[1]) - M(xC1S7, ip[7]); | |||||
| C = M(xC3S5, ip[3]) + M(xC5S3, ip[5]); | |||||
| D = M(xC3S5, ip[5]) - M(xC5S3, ip[3]); | |||||
| if ( ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] | | |||||
| ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8] ) { | |||||
| A = M(xC1S7, ip[1 * 8]) + M(xC7S1, ip[7 * 8]); | |||||
| B = M(xC7S1, ip[1 * 8]) - M(xC1S7, ip[7 * 8]); | |||||
| C = M(xC3S5, ip[3 * 8]) + M(xC5S3, ip[5 * 8]); | |||||
| D = M(xC3S5, ip[5 * 8]) - M(xC5S3, ip[3 * 8]); | |||||
| Ad = M(xC4S4, (A - C)); | Ad = M(xC4S4, (A - C)); | ||||
| Bd = M(xC4S4, (B - D)); | Bd = M(xC4S4, (B - D)); | ||||
| @@ -66,11 +67,11 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int | |||||
| Cd = A + C; | Cd = A + C; | ||||
| Dd = B + D; | Dd = B + D; | ||||
| E = M(xC4S4, (ip[0] + ip[4])); | |||||
| F = M(xC4S4, (ip[0] - ip[4])); | |||||
| E = M(xC4S4, (ip[0 * 8] + ip[4 * 8])); | |||||
| F = M(xC4S4, (ip[0 * 8] - ip[4 * 8])); | |||||
| G = M(xC2S6, ip[2]) + M(xC6S2, ip[6]); | |||||
| H = M(xC6S2, ip[2]) - M(xC2S6, ip[6]); | |||||
| G = M(xC2S6, ip[2 * 8]) + M(xC6S2, ip[6 * 8]); | |||||
| H = M(xC6S2, ip[2 * 8]) - M(xC2S6, ip[6 * 8]); | |||||
| Ed = E - G; | Ed = E - G; | ||||
| Gd = E + G; | Gd = E + G; | ||||
| @@ -82,33 +83,33 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int | |||||
| Hd = Bd + H; | Hd = Bd + H; | ||||
| /* Final sequence of operations over-write original inputs. */ | /* Final sequence of operations over-write original inputs. */ | ||||
| ip[0] = Gd + Cd ; | |||||
| ip[7] = Gd - Cd ; | |||||
| ip[0 * 8] = Gd + Cd ; | |||||
| ip[7 * 8] = Gd - Cd ; | |||||
| ip[1] = Add + Hd; | |||||
| ip[2] = Add - Hd; | |||||
| ip[1 * 8] = Add + Hd; | |||||
| ip[2 * 8] = Add - Hd; | |||||
| ip[3] = Ed + Dd ; | |||||
| ip[4] = Ed - Dd ; | |||||
| ip[3 * 8] = Ed + Dd ; | |||||
| ip[4 * 8] = Ed - Dd ; | |||||
| ip[5] = Fd + Bdd; | |||||
| ip[6] = Fd - Bdd; | |||||
| ip[5 * 8] = Fd + Bdd; | |||||
| ip[6 * 8] = Fd - Bdd; | |||||
| } | } | ||||
| ip += 8; /* next row */ | |||||
| ip += 1; /* next row */ | |||||
| } | } | ||||
| ip = input; | ip = input; | ||||
| for ( i = 0; i < 8; i++) { | for ( i = 0; i < 8; i++) { | ||||
| /* Check for non-zero values (bitwise or faster than ||) */ | /* Check for non-zero values (bitwise or faster than ||) */ | ||||
| if ( ip[1 * 8] | ip[2 * 8] | ip[3 * 8] | | |||||
| ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8] ) { | |||||
| if ( ip[1] | ip[2] | ip[3] | | |||||
| ip[4] | ip[5] | ip[6] | ip[7] ) { | |||||
| A = M(xC1S7, ip[1*8]) + M(xC7S1, ip[7*8]); | |||||
| B = M(xC7S1, ip[1*8]) - M(xC1S7, ip[7*8]); | |||||
| C = M(xC3S5, ip[3*8]) + M(xC5S3, ip[5*8]); | |||||
| D = M(xC3S5, ip[5*8]) - M(xC5S3, ip[3*8]); | |||||
| A = M(xC1S7, ip[1]) + M(xC7S1, ip[7]); | |||||
| B = M(xC7S1, ip[1]) - M(xC1S7, ip[7]); | |||||
| C = M(xC3S5, ip[3]) + M(xC5S3, ip[5]); | |||||
| D = M(xC3S5, ip[5]) - M(xC5S3, ip[3]); | |||||
| Ad = M(xC4S4, (A - C)); | Ad = M(xC4S4, (A - C)); | ||||
| Bd = M(xC4S4, (B - D)); | Bd = M(xC4S4, (B - D)); | ||||
| @@ -116,16 +117,16 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int | |||||
| Cd = A + C; | Cd = A + C; | ||||
| Dd = B + D; | Dd = B + D; | ||||
| E = M(xC4S4, (ip[0*8] + ip[4*8])) + 8; | |||||
| F = M(xC4S4, (ip[0*8] - ip[4*8])) + 8; | |||||
| E = M(xC4S4, (ip[0] + ip[4])) + 8; | |||||
| F = M(xC4S4, (ip[0] - ip[4])) + 8; | |||||
| if(type==1){ //HACK | if(type==1){ //HACK | ||||
| E += 16*128; | E += 16*128; | ||||
| F += 16*128; | F += 16*128; | ||||
| } | } | ||||
| G = M(xC2S6, ip[2*8]) + M(xC6S2, ip[6*8]); | |||||
| H = M(xC6S2, ip[2*8]) - M(xC2S6, ip[6*8]); | |||||
| G = M(xC2S6, ip[2]) + M(xC6S2, ip[6]); | |||||
| H = M(xC6S2, ip[2]) - M(xC2S6, ip[6]); | |||||
| Ed = E - G; | Ed = E - G; | ||||
| Gd = E + G; | Gd = E + G; | ||||
| @@ -137,19 +138,7 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int | |||||
| Hd = Bd + H; | Hd = Bd + H; | ||||
| /* Final sequence of operations over-write original inputs. */ | /* Final sequence of operations over-write original inputs. */ | ||||
| if(type==0){ | |||||
| ip[0*8] = (Gd + Cd ) >> 4; | |||||
| ip[7*8] = (Gd - Cd ) >> 4; | |||||
| ip[1*8] = (Add + Hd ) >> 4; | |||||
| ip[2*8] = (Add - Hd ) >> 4; | |||||
| ip[3*8] = (Ed + Dd ) >> 4; | |||||
| ip[4*8] = (Ed - Dd ) >> 4; | |||||
| ip[5*8] = (Fd + Bdd ) >> 4; | |||||
| ip[6*8] = (Fd - Bdd ) >> 4; | |||||
| }else if(type==1){ | |||||
| if (type == 1) { | |||||
| dst[0*stride] = av_clip_uint8((Gd + Cd ) >> 4); | dst[0*stride] = av_clip_uint8((Gd + Cd ) >> 4); | ||||
| dst[7*stride] = av_clip_uint8((Gd - Cd ) >> 4); | dst[7*stride] = av_clip_uint8((Gd - Cd ) >> 4); | ||||
| @@ -176,16 +165,7 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int | |||||
| } | } | ||||
| } else { | } else { | ||||
| if(type==0){ | |||||
| ip[0*8] = | |||||
| ip[1*8] = | |||||
| ip[2*8] = | |||||
| ip[3*8] = | |||||
| ip[4*8] = | |||||
| ip[5*8] = | |||||
| ip[6*8] = | |||||
| ip[7*8] = ((xC4S4 * ip[0*8] + (IdctAdjustBeforeShift<<16))>>20); | |||||
| }else if(type==1){ | |||||
| if (type == 1) { | |||||
| dst[0*stride]= | dst[0*stride]= | ||||
| dst[1*stride]= | dst[1*stride]= | ||||
| dst[2*stride]= | dst[2*stride]= | ||||
| @@ -193,10 +173,10 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int | |||||
| dst[4*stride]= | dst[4*stride]= | ||||
| dst[5*stride]= | dst[5*stride]= | ||||
| dst[6*stride]= | dst[6*stride]= | ||||
| dst[7*stride]= av_clip_uint8(128 + ((xC4S4 * ip[0*8] + (IdctAdjustBeforeShift<<16))>>20)); | |||||
| dst[7*stride]= av_clip_uint8(128 + ((xC4S4 * ip[0] + (IdctAdjustBeforeShift<<16))>>20)); | |||||
| }else{ | }else{ | ||||
| if(ip[0*8]){ | |||||
| int v= ((xC4S4 * ip[0*8] + (IdctAdjustBeforeShift<<16))>>20); | |||||
| if(ip[0]){ | |||||
| int v= ((xC4S4 * ip[0] + (IdctAdjustBeforeShift<<16))>>20); | |||||
| dst[0*stride] = av_clip_uint8(dst[0*stride] + v); | dst[0*stride] = av_clip_uint8(dst[0*stride] + v); | ||||
| dst[1*stride] = av_clip_uint8(dst[1*stride] + v); | dst[1*stride] = av_clip_uint8(dst[1*stride] + v); | ||||
| dst[2*stride] = av_clip_uint8(dst[2*stride] + v); | dst[2*stride] = av_clip_uint8(dst[2*stride] + v); | ||||
| @@ -209,7 +189,7 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int | |||||
| } | } | ||||
| } | } | ||||
| ip++; /* next column */ | |||||
| ip += 8; /* next column */ | |||||
| dst++; | dst++; | ||||
| } | } | ||||
| } | } | ||||
| @@ -307,8 +287,6 @@ av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags) | |||||
| c->v_loop_filter = vp3_v_loop_filter_c; | c->v_loop_filter = vp3_v_loop_filter_c; | ||||
| c->h_loop_filter = vp3_h_loop_filter_c; | c->h_loop_filter = vp3_h_loop_filter_c; | ||||
| c->idct_perm = FF_NO_IDCT_PERM; | |||||
| if (ARCH_ARM) | if (ARCH_ARM) | ||||
| ff_vp3dsp_init_arm(c, flags); | ff_vp3dsp_init_arm(c, flags); | ||||
| if (ARCH_BFIN) | if (ARCH_BFIN) | ||||
| @@ -43,8 +43,6 @@ typedef struct VP3DSPContext { | |||||
| void (*idct_dc_add)(uint8_t *dest, int line_size, int16_t *block); | void (*idct_dc_add)(uint8_t *dest, int line_size, int16_t *block); | ||||
| void (*v_loop_filter)(uint8_t *src, int stride, int *bounding_values); | void (*v_loop_filter)(uint8_t *src, int stride, int *bounding_values); | ||||
| void (*h_loop_filter)(uint8_t *src, int stride, int *bounding_values); | void (*h_loop_filter)(uint8_t *src, int stride, int *bounding_values); | ||||
| int idct_perm; | |||||
| } VP3DSPContext; | } VP3DSPContext; | ||||
| void ff_vp3dsp_init(VP3DSPContext *c, int flags); | void ff_vp3dsp_init(VP3DSPContext *c, int flags); | ||||
| @@ -173,7 +173,7 @@ static void vp5_parse_coeff(VP56Context *s) | |||||
| { | { | ||||
| VP56RangeCoder *c = &s->c; | VP56RangeCoder *c = &s->c; | ||||
| VP56Model *model = s->modelp; | VP56Model *model = s->modelp; | ||||
| uint8_t *permute = s->scantable.permutated; | |||||
| uint8_t *permute = s->idct_scantable; | |||||
| uint8_t *model1, *model2; | uint8_t *model1, *model2; | ||||
| int coeff, sign, coeff_idx; | int coeff, sign, coeff_idx; | ||||
| int b, i, cg, idx, ctx, ctx_last; | int b, i, cg, idx, ctx, ctx_last; | ||||
| @@ -263,7 +263,7 @@ static VP56mb vp56_decode_mv(VP56Context *s, int row, int col) | |||||
| static void vp56_add_predictors_dc(VP56Context *s, VP56Frame ref_frame) | static void vp56_add_predictors_dc(VP56Context *s, VP56Frame ref_frame) | ||||
| { | { | ||||
| int idx = s->scantable.permutated[0]; | |||||
| int idx = s->idct_scantable[0]; | |||||
| int b; | int b; | ||||
| for (b=0; b<6; b++) { | for (b=0; b<6; b++) { | ||||
| @@ -661,8 +661,11 @@ av_cold int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha) | |||||
| ff_videodsp_init(&s->vdsp, 8); | ff_videodsp_init(&s->vdsp, 8); | ||||
| ff_vp3dsp_init(&s->vp3dsp, avctx->flags); | ff_vp3dsp_init(&s->vp3dsp, avctx->flags); | ||||
| ff_vp56dsp_init(&s->vp56dsp, avctx->codec->id); | ff_vp56dsp_init(&s->vp56dsp, avctx->codec->id); | ||||
| ff_init_scantable_permutation(s->dsp.idct_permutation, s->vp3dsp.idct_perm); | |||||
| ff_init_scantable(s->dsp.idct_permutation, &s->scantable,ff_zigzag_direct); | |||||
| for (i = 0; i < 64; i++) { | |||||
| #define T(x) (x >> 3) | ((x & 7) << 3) | |||||
| s->idct_scantable[i] = T(ff_zigzag_direct[i]); | |||||
| #undef T | |||||
| } | |||||
| for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) { | for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) { | ||||
| s->frames[i] = av_frame_alloc(); | s->frames[i] = av_frame_alloc(); | ||||
| @@ -100,7 +100,7 @@ struct vp56_context { | |||||
| VideoDSPContext vdsp; | VideoDSPContext vdsp; | ||||
| VP3DSPContext vp3dsp; | VP3DSPContext vp3dsp; | ||||
| VP56DSPContext vp56dsp; | VP56DSPContext vp56dsp; | ||||
| ScanTable scantable; | |||||
| uint8_t idct_scantable[64]; | |||||
| AVFrame *frames[4]; | AVFrame *frames[4]; | ||||
| uint8_t *edge_emu_buffer_alloc; | uint8_t *edge_emu_buffer_alloc; | ||||
| uint8_t *edge_emu_buffer; | uint8_t *edge_emu_buffer; | ||||
| @@ -368,7 +368,7 @@ static unsigned vp6_get_nb_null(VP56Context *s) | |||||
| static void vp6_parse_coeff_huffman(VP56Context *s) | static void vp6_parse_coeff_huffman(VP56Context *s) | ||||
| { | { | ||||
| VP56Model *model = s->modelp; | VP56Model *model = s->modelp; | ||||
| uint8_t *permute = s->scantable.permutated; | |||||
| uint8_t *permute = s->idct_scantable; | |||||
| VLC *vlc_coeff; | VLC *vlc_coeff; | ||||
| int coeff, sign, coeff_idx; | int coeff, sign, coeff_idx; | ||||
| int b, cg, idx; | int b, cg, idx; | ||||
| @@ -428,7 +428,7 @@ static void vp6_parse_coeff(VP56Context *s) | |||||
| { | { | ||||
| VP56RangeCoder *c = s->ccp; | VP56RangeCoder *c = s->ccp; | ||||
| VP56Model *model = s->modelp; | VP56Model *model = s->modelp; | ||||
| uint8_t *permute = s->scantable.permutated; | |||||
| uint8_t *permute = s->idct_scantable; | |||||
| uint8_t *model1, *model2, *model3; | uint8_t *model1, *model2, *model3; | ||||
| int coeff, sign, coeff_idx; | int coeff, sign, coeff_idx; | ||||
| int b, i, cg, idx, ctx; | int b, i, cg, idx, ctx; | ||||
| @@ -501,22 +501,22 @@ cglobal vp3_h_loop_filter, 3, 4 | |||||
| ; at this point, function has completed dequantization + dezigzag + | ; at this point, function has completed dequantization + dezigzag + | ||||
| ; partial transposition; now do the idct itself | ; partial transposition; now do the idct itself | ||||
| %define I(x) [%1+16* x ] | |||||
| %define J(x) [%1+16*(x-4)+8] | |||||
| %define I(x) [%1+16*x] | |||||
| %define J(x) [%1+16*x] | |||||
| RowIDCT | RowIDCT | ||||
| Transpose | Transpose | ||||
| %define I(x) [%1+16* x +64] | |||||
| %define J(x) [%1+16*(x-4)+72] | |||||
| %define I(x) [%1+16*x+8] | |||||
| %define J(x) [%1+16*x+8] | |||||
| RowIDCT | RowIDCT | ||||
| Transpose | Transpose | ||||
| %define I(x) [%1+16*x] | |||||
| %define J(x) [%1+16*x] | |||||
| %define I(x) [%1+16* x] | |||||
| %define J(x) [%1+16*(x-4)+8] | |||||
| ColumnIDCT | ColumnIDCT | ||||
| %define I(x) [%1+16*x+8] | |||||
| %define J(x) [%1+16*x+8] | |||||
| %define I(x) [%1+16* x +64] | |||||
| %define J(x) [%1+16*(x-4)+72] | |||||
| ColumnIDCT | ColumnIDCT | ||||
| %endif ; mmsize == 16/8 | %endif ; mmsize == 16/8 | ||||
| %endmacro | %endmacro | ||||
| @@ -534,10 +534,17 @@ cglobal vp3_idct_put, 3, 4, 9 | |||||
| mova m1, [r2+mmsize*2+%%i] | mova m1, [r2+mmsize*2+%%i] | ||||
| mova m2, [r2+mmsize*4+%%i] | mova m2, [r2+mmsize*4+%%i] | ||||
| mova m3, [r2+mmsize*6+%%i] | mova m3, [r2+mmsize*6+%%i] | ||||
| %if mmsize == 8 | |||||
| packsswb m0, [r2+mmsize*8+%%i] | |||||
| packsswb m1, [r2+mmsize*10+%%i] | |||||
| packsswb m2, [r2+mmsize*12+%%i] | |||||
| packsswb m3, [r2+mmsize*14+%%i] | |||||
| %else | |||||
| packsswb m0, [r2+mmsize*1+%%i] | packsswb m0, [r2+mmsize*1+%%i] | ||||
| packsswb m1, [r2+mmsize*3+%%i] | packsswb m1, [r2+mmsize*3+%%i] | ||||
| packsswb m2, [r2+mmsize*5+%%i] | packsswb m2, [r2+mmsize*5+%%i] | ||||
| packsswb m3, [r2+mmsize*7+%%i] | packsswb m3, [r2+mmsize*7+%%i] | ||||
| %endif | |||||
| paddb m0, m4 | paddb m0, m4 | ||||
| paddb m1, m4 | paddb m1, m4 | ||||
| paddb m2, m4 | paddb m2, m4 | ||||
| @@ -561,7 +568,7 @@ cglobal vp3_idct_put, 3, 4, 9 | |||||
| movq [r0+r1*2], m3 | movq [r0+r1*2], m3 | ||||
| movhps [r0+r3 ], m3 | movhps [r0+r3 ], m3 | ||||
| %endif | %endif | ||||
| %assign %%i %%i+64 | |||||
| %assign %%i %%i+8 | |||||
| %endrep | %endrep | ||||
| pxor m0, m0 | pxor m0, m0 | ||||
| @@ -575,47 +582,81 @@ cglobal vp3_idct_put, 3, 4, 9 | |||||
| cglobal vp3_idct_add, 3, 4, 9 | cglobal vp3_idct_add, 3, 4, 9 | ||||
| VP3_IDCT r2 | VP3_IDCT r2 | ||||
| mov r3, 4 | |||||
| pxor m4, m4 | |||||
| movsxdifnidn r1, r1d | movsxdifnidn r1, r1d | ||||
| .loop: | |||||
| lea r3, [r1*3] | |||||
| pxor m4, m4 | |||||
| %if mmsize == 16 | |||||
| %assign %%i 0 | |||||
| %rep 2 | |||||
| movq m0, [r0] | movq m0, [r0] | ||||
| movq m1, [r0+r1] | movq m1, [r0+r1] | ||||
| %if mmsize == 8 | |||||
| mova m2, m0 | |||||
| mova m3, m1 | |||||
| %endif | |||||
| movq m2, [r0+r1*2] | |||||
| movq m3, [r0+r3] | |||||
| punpcklbw m0, m4 | punpcklbw m0, m4 | ||||
| punpcklbw m1, m4 | punpcklbw m1, m4 | ||||
| %if mmsize == 8 | |||||
| punpckhbw m2, m4 | |||||
| punpckhbw m3, m4 | |||||
| %endif | |||||
| paddsw m0, [r2+ 0] | |||||
| paddsw m1, [r2+16] | |||||
| %if mmsize == 8 | |||||
| paddsw m2, [r2+ 8] | |||||
| paddsw m3, [r2+24] | |||||
| packuswb m0, m2 | |||||
| packuswb m1, m3 | |||||
| %else ; mmsize == 16 | |||||
| punpcklbw m2, m4 | |||||
| punpcklbw m3, m4 | |||||
| paddsw m0, [r2+ 0+%%i] | |||||
| paddsw m1, [r2+16+%%i] | |||||
| paddsw m2, [r2+32+%%i] | |||||
| paddsw m3, [r2+48+%%i] | |||||
| packuswb m0, m1 | packuswb m0, m1 | ||||
| packuswb m2, m3 | |||||
| movq [r0 ], m0 | |||||
| movhps [r0+r1 ], m0 | |||||
| movq [r0+r1*2], m2 | |||||
| movhps [r0+r3 ], m2 | |||||
| %if %%i == 0 | |||||
| lea r0, [r0+r1*4] | |||||
| %endif | %endif | ||||
| movq [r0 ], m0 | |||||
| %if mmsize == 8 | |||||
| movq [r0+r1], m1 | |||||
| %else ; mmsize == 16 | |||||
| movhps [r0+r1], m0 | |||||
| %assign %%i %%i+64 | |||||
| %endrep | |||||
| %else | |||||
| %assign %%i 0 | |||||
| %rep 2 | |||||
| movq m0, [r0] | |||||
| movq m1, [r0+r1] | |||||
| movq m2, [r0+r1*2] | |||||
| movq m3, [r0+r3] | |||||
| movq m5, m0 | |||||
| movq m6, m1 | |||||
| movq m7, m2 | |||||
| punpcklbw m0, m4 | |||||
| punpcklbw m1, m4 | |||||
| punpcklbw m2, m4 | |||||
| punpckhbw m5, m4 | |||||
| punpckhbw m6, m4 | |||||
| punpckhbw m7, m4 | |||||
| paddsw m0, [r2+ 0+%%i] | |||||
| paddsw m1, [r2+16+%%i] | |||||
| paddsw m2, [r2+32+%%i] | |||||
| paddsw m5, [r2+64+%%i] | |||||
| paddsw m6, [r2+80+%%i] | |||||
| paddsw m7, [r2+96+%%i] | |||||
| packuswb m0, m5 | |||||
| movq m5, m3 | |||||
| punpcklbw m3, m4 | |||||
| punpckhbw m5, m4 | |||||
| packuswb m1, m6 | |||||
| paddsw m3, [r2+48+%%i] | |||||
| paddsw m5, [r2+112+%%i] | |||||
| packuswb m2, m7 | |||||
| packuswb m3, m5 | |||||
| movq [r0 ], m0 | |||||
| movq [r0+r1 ], m1 | |||||
| movq [r0+r1*2], m2 | |||||
| movq [r0+r3 ], m3 | |||||
| %if %%i == 0 | |||||
| lea r0, [r0+r1*4] | |||||
| %endif | %endif | ||||
| lea r0, [r0+r1*2] | |||||
| %assign %%offset 0 | |||||
| %rep 32/mmsize | |||||
| mova [r2+%%offset], m4 | |||||
| %assign %%offset %%offset+mmsize | |||||
| %assign %%i %%i+8 | |||||
| %endrep | |||||
| %endif | |||||
| %assign %%i 0 | |||||
| %rep 128/mmsize | |||||
| mova [r2+%%i], m4 | |||||
| %assign %%i %%i+mmsize | |||||
| %endrep | %endrep | ||||
| add r2, 32 | |||||
| dec r3 | |||||
| jg .loop | |||||
| RET | RET | ||||
| %endmacro | %endmacro | ||||
| @@ -48,7 +48,6 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags) | |||||
| if (EXTERNAL_MMX(cpuflags)) { | if (EXTERNAL_MMX(cpuflags)) { | ||||
| c->idct_put = ff_vp3_idct_put_mmx; | c->idct_put = ff_vp3_idct_put_mmx; | ||||
| c->idct_add = ff_vp3_idct_add_mmx; | c->idct_add = ff_vp3_idct_add_mmx; | ||||
| c->idct_perm = FF_PARTTRANS_IDCT_PERM; | |||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -64,6 +63,5 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags) | |||||
| if (EXTERNAL_SSE2(cpuflags)) { | if (EXTERNAL_SSE2(cpuflags)) { | ||||
| c->idct_put = ff_vp3_idct_put_sse2; | c->idct_put = ff_vp3_idct_put_sse2; | ||||
| c->idct_add = ff_vp3_idct_add_sse2; | c->idct_add = ff_vp3_idct_add_sse2; | ||||
| c->idct_perm = FF_TRANSPOSE_IDCT_PERM; | |||||
| } | } | ||||
| } | } | ||||