SAD functions rewritten (8x8 support & MMX2 optimizations). HQ inter/intra decision. msmpeg4 encoding bugfix (MVs were too long). Originally committed as revision 362 to svn://svn.ffmpeg.org/ffmpeg/trunk tags/v0.5
| @@ -61,9 +61,14 @@ extern int motion_estimation_method; | |||||
| #define ME_X1 5 | #define ME_X1 5 | ||||
| /* encoding support */ | /* encoding support */ | ||||
| /* note not everything is supported yet */ | |||||
| #define CODEC_FLAG_HQ 0x0001 /* high quality (non real time) encoding */ | #define CODEC_FLAG_HQ 0x0001 /* high quality (non real time) encoding */ | ||||
| #define CODEC_FLAG_QSCALE 0x0002 /* use fixed qscale */ | #define CODEC_FLAG_QSCALE 0x0002 /* use fixed qscale */ | ||||
| #define CODEC_FLAG_4MV 0x0004 /* 4 MV per MB allowed */ | |||||
| #define CODEC_FLAG_B 0x0008 /* use B frames */ | |||||
| #define CODEC_FLAG_QPEL 0x0010 /* use qpel MC */ | |||||
| #define CODEC_FLAG_GMC 0x0020 /* use GMC */ | |||||
| /* codec capabilities */ | /* codec capabilities */ | ||||
| @@ -36,6 +36,11 @@ op_pixels_abs_func pix_abs16x16_x2; | |||||
| op_pixels_abs_func pix_abs16x16_y2; | op_pixels_abs_func pix_abs16x16_y2; | ||||
| op_pixels_abs_func pix_abs16x16_xy2; | op_pixels_abs_func pix_abs16x16_xy2; | ||||
| op_pixels_abs_func pix_abs8x8; | |||||
| op_pixels_abs_func pix_abs8x8_x2; | |||||
| op_pixels_abs_func pix_abs8x8_y2; | |||||
| op_pixels_abs_func pix_abs8x8_xy2; | |||||
| UINT8 cropTbl[256 + 2 * MAX_NEG_CROP]; | UINT8 cropTbl[256 + 2 * MAX_NEG_CROP]; | ||||
| UINT32 squareTbl[512]; | UINT32 squareTbl[512]; | ||||
| @@ -377,14 +382,14 @@ static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, | |||||
| int i; | int i; | ||||
| for(i=0; i<h; i++) | for(i=0; i<h; i++) | ||||
| { | { | ||||
| dst[0]= cm[(((src[0]+src[1])*160 - (src[0]+src[2])*48 + (src[1]+src[3])*24 - (src[2]+src[4])*8 + r)>>8)]; | |||||
| dst[1]= cm[(((src[1]+src[2])*160 - (src[0]+src[3])*48 + (src[0]+src[4])*24 - (src[1]+src[5])*8 + r)>>8)]; | |||||
| dst[2]= cm[(((src[2]+src[3])*160 - (src[1]+src[4])*48 + (src[0]+src[5])*24 - (src[0]+src[6])*8 + r)>>8)]; | |||||
| dst[3]= cm[(((src[3]+src[4])*160 - (src[2]+src[5])*48 + (src[1]+src[6])*24 - (src[0]+src[7])*8 + r)>>8)]; | |||||
| dst[4]= cm[(((src[4]+src[5])*160 - (src[3]+src[6])*48 + (src[2]+src[7])*24 - (src[1]+src[8])*8 + r)>>8)]; | |||||
| dst[5]= cm[(((src[5]+src[6])*160 - (src[4]+src[7])*48 + (src[3]+src[8])*24 - (src[2]+src[8])*8 + r)>>8)]; | |||||
| dst[6]= cm[(((src[6]+src[7])*160 - (src[5]+src[8])*48 + (src[4]+src[8])*24 - (src[3]+src[7])*8 + r)>>8)]; | |||||
| dst[7]= cm[(((src[7]+src[8])*160 - (src[6]+src[8])*48 + (src[5]+src[7])*24 - (src[4]+src[6])*8 + r)>>8)]; | |||||
| dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)]; | |||||
| dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)]; | |||||
| dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)]; | |||||
| dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)]; | |||||
| dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)]; | |||||
| dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)]; | |||||
| dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)]; | |||||
| dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)]; | |||||
| dst+=dstStride; | dst+=dstStride; | ||||
| src+=srcStride; | src+=srcStride; | ||||
| } | } | ||||
| @@ -405,14 +410,14 @@ static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, | |||||
| const int src6= src[6*srcStride]; | const int src6= src[6*srcStride]; | ||||
| const int src7= src[7*srcStride]; | const int src7= src[7*srcStride]; | ||||
| const int src8= src[8*srcStride]; | const int src8= src[8*srcStride]; | ||||
| dst[0*dstStride]= cm[(((src0+src1)*160 - (src0+src2)*48 + (src1+src3)*24 - (src2+src4)*8 + r)>>8)]; | |||||
| dst[1*dstStride]= cm[(((src1+src2)*160 - (src0+src3)*48 + (src0+src4)*24 - (src1+src5)*8 + r)>>8)]; | |||||
| dst[2*dstStride]= cm[(((src2+src3)*160 - (src1+src4)*48 + (src0+src5)*24 - (src0+src6)*8 + r)>>8)]; | |||||
| dst[3*dstStride]= cm[(((src3+src4)*160 - (src2+src5)*48 + (src1+src6)*24 - (src0+src7)*8 + r)>>8)]; | |||||
| dst[4*dstStride]= cm[(((src4+src5)*160 - (src3+src6)*48 + (src2+src7)*24 - (src1+src8)*8 + r)>>8)]; | |||||
| dst[5*dstStride]= cm[(((src5+src6)*160 - (src4+src7)*48 + (src3+src8)*24 - (src2+src8)*8 + r)>>8)]; | |||||
| dst[6*dstStride]= cm[(((src6+src7)*160 - (src5+src8)*48 + (src4+src8)*24 - (src3+src7)*8 + r)>>8)]; | |||||
| dst[7*dstStride]= cm[(((src7+src8)*160 - (src6+src8)*48 + (src5+src7)*24 - (src4+src6)*8 + r)>>8)]; | |||||
| dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)]; | |||||
| dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)]; | |||||
| dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)]; | |||||
| dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)]; | |||||
| dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)]; | |||||
| dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)]; | |||||
| dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)]; | |||||
| dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)]; | |||||
| dst++; | dst++; | ||||
| src++; | src++; | ||||
| } | } | ||||
| @@ -485,38 +490,38 @@ static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS | |||||
| static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | ||||
| {\ | {\ | ||||
| UINT8 half[64];\ | UINT8 half[64];\ | ||||
| qpel_h_lowpass(half, src, 8, srcStride, 8, 128-r);\ | |||||
| qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\ | |||||
| avg2_block(dst, src, half, dstStride, srcStride, 1-r);\ | avg2_block(dst, src, half, dstStride, srcStride, 1-r);\ | ||||
| }\ | }\ | ||||
| \ | \ | ||||
| static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | ||||
| {\ | {\ | ||||
| qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 128-r);\ | |||||
| qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\ | |||||
| }\ | }\ | ||||
| \ | \ | ||||
| static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | ||||
| {\ | {\ | ||||
| UINT8 half[64];\ | UINT8 half[64];\ | ||||
| qpel_h_lowpass(half, src, 8, srcStride, 8, 128-r);\ | |||||
| qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\ | |||||
| avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\ | avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\ | ||||
| }\ | }\ | ||||
| \ | \ | ||||
| static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | ||||
| {\ | {\ | ||||
| UINT8 half[64];\ | UINT8 half[64];\ | ||||
| qpel_v_lowpass(half, src, 8, srcStride, 8, 128-r);\ | |||||
| qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\ | |||||
| avg2_block(dst, src, half, dstStride, srcStride, 1-r);\ | avg2_block(dst, src, half, dstStride, srcStride, 1-r);\ | ||||
| }\ | }\ | ||||
| \ | \ | ||||
| static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | ||||
| {\ | {\ | ||||
| qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 128-r);\ | |||||
| qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\ | |||||
| }\ | }\ | ||||
| \ | \ | ||||
| static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | ||||
| {\ | {\ | ||||
| UINT8 half[64];\ | UINT8 half[64];\ | ||||
| qpel_v_lowpass(half, src, 8, srcStride, 8, 128-r);\ | |||||
| qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\ | |||||
| avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\ | avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\ | ||||
| }\ | }\ | ||||
| static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | ||||
| @@ -524,9 +529,9 @@ static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS | |||||
| UINT8 halfH[72];\ | UINT8 halfH[72];\ | ||||
| UINT8 halfV[64];\ | UINT8 halfV[64];\ | ||||
| UINT8 halfHV[64];\ | UINT8 halfHV[64];\ | ||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\ | |||||
| qpel_v_lowpass(halfV, src, 8, srcStride, 8, 128-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\ | |||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\ | |||||
| qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\ | |||||
| avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\ | avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\ | ||||
| }\ | }\ | ||||
| static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | ||||
| @@ -534,9 +539,9 @@ static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS | |||||
| UINT8 halfH[72];\ | UINT8 halfH[72];\ | ||||
| UINT8 halfV[64];\ | UINT8 halfV[64];\ | ||||
| UINT8 halfHV[64];\ | UINT8 halfHV[64];\ | ||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\ | |||||
| qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 128-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\ | |||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\ | |||||
| qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\ | |||||
| avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\ | avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\ | ||||
| }\ | }\ | ||||
| static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | ||||
| @@ -544,9 +549,9 @@ static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS | |||||
| UINT8 halfH[72];\ | UINT8 halfH[72];\ | ||||
| UINT8 halfV[64];\ | UINT8 halfV[64];\ | ||||
| UINT8 halfHV[64];\ | UINT8 halfHV[64];\ | ||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\ | |||||
| qpel_v_lowpass(halfV, src, 8, srcStride, 8, 128-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\ | |||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\ | |||||
| qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\ | |||||
| avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\ | avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\ | ||||
| }\ | }\ | ||||
| static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | ||||
| @@ -554,25 +559,25 @@ static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS | |||||
| UINT8 halfH[72];\ | UINT8 halfH[72];\ | ||||
| UINT8 halfV[64];\ | UINT8 halfV[64];\ | ||||
| UINT8 halfHV[64];\ | UINT8 halfHV[64];\ | ||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\ | |||||
| qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 128-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\ | |||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\ | |||||
| qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\ | |||||
| avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\ | avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\ | ||||
| }\ | }\ | ||||
| static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | ||||
| {\ | {\ | ||||
| UINT8 halfH[72];\ | UINT8 halfH[72];\ | ||||
| UINT8 halfHV[64];\ | UINT8 halfHV[64];\ | ||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\ | |||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\ | |||||
| avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\ | avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\ | ||||
| }\ | }\ | ||||
| static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | ||||
| {\ | {\ | ||||
| UINT8 halfH[72];\ | UINT8 halfH[72];\ | ||||
| UINT8 halfHV[64];\ | UINT8 halfHV[64];\ | ||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\ | |||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\ | |||||
| avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\ | avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\ | ||||
| }\ | }\ | ||||
| static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | ||||
| @@ -580,9 +585,9 @@ static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS | |||||
| UINT8 halfH[72];\ | UINT8 halfH[72];\ | ||||
| UINT8 halfV[64];\ | UINT8 halfV[64];\ | ||||
| UINT8 halfHV[64];\ | UINT8 halfHV[64];\ | ||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\ | |||||
| qpel_v_lowpass(halfV, src, 8, srcStride, 8, 128-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\ | |||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\ | |||||
| qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\ | |||||
| avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\ | avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\ | ||||
| }\ | }\ | ||||
| static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | ||||
| @@ -590,16 +595,16 @@ static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS | |||||
| UINT8 halfH[72];\ | UINT8 halfH[72];\ | ||||
| UINT8 halfV[64];\ | UINT8 halfV[64];\ | ||||
| UINT8 halfHV[64];\ | UINT8 halfHV[64];\ | ||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\ | |||||
| qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 128-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\ | |||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\ | |||||
| qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\ | |||||
| qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\ | |||||
| avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\ | avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\ | ||||
| }\ | }\ | ||||
| static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\ | ||||
| {\ | {\ | ||||
| UINT8 halfH[72];\ | UINT8 halfH[72];\ | ||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\ | |||||
| qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 128-r);\ | |||||
| qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\ | |||||
| qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\ | |||||
| }\ | }\ | ||||
| qpel_mc_func qpel_mc ## name ## _tab[16]={ \ | qpel_mc_func qpel_mc ## name ## _tab[16]={ \ | ||||
| qpel_mc00_c ## name, \ | qpel_mc00_c ## name, \ | ||||
| @@ -623,12 +628,12 @@ qpel_mc_func qpel_mc ## name ## _tab[16]={ \ | |||||
| QPEL_MC(0, _rnd) | QPEL_MC(0, _rnd) | ||||
| QPEL_MC(1, _no_rnd) | QPEL_MC(1, _no_rnd) | ||||
| int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h) | |||||
| int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size) | |||||
| { | { | ||||
| int s, i; | int s, i; | ||||
| s = 0; | s = 0; | ||||
| for(i=0;i<h;i++) { | |||||
| for(i=0;i<16;i++) { | |||||
| s += abs(pix1[0] - pix2[0]); | s += abs(pix1[0] - pix2[0]); | ||||
| s += abs(pix1[1] - pix2[1]); | s += abs(pix1[1] - pix2[1]); | ||||
| s += abs(pix1[2] - pix2[2]); | s += abs(pix1[2] - pix2[2]); | ||||
| @@ -651,12 +656,12 @@ int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h) | |||||
| return s; | return s; | ||||
| } | } | ||||
| int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h) | |||||
| int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size) | |||||
| { | { | ||||
| int s, i; | int s, i; | ||||
| s = 0; | s = 0; | ||||
| for(i=0;i<h;i++) { | |||||
| for(i=0;i<16;i++) { | |||||
| s += abs(pix1[0] - avg2(pix2[0], pix2[1])); | s += abs(pix1[0] - avg2(pix2[0], pix2[1])); | ||||
| s += abs(pix1[1] - avg2(pix2[1], pix2[2])); | s += abs(pix1[1] - avg2(pix2[1], pix2[2])); | ||||
| s += abs(pix1[2] - avg2(pix2[2], pix2[3])); | s += abs(pix1[2] - avg2(pix2[2], pix2[3])); | ||||
| @@ -679,13 +684,13 @@ int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h) | |||||
| return s; | return s; | ||||
| } | } | ||||
| int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h) | |||||
| int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size) | |||||
| { | { | ||||
| int s, i; | int s, i; | ||||
| UINT8 *pix3 = pix2 + line_size; | UINT8 *pix3 = pix2 + line_size; | ||||
| s = 0; | s = 0; | ||||
| for(i=0;i<h;i++) { | |||||
| for(i=0;i<16;i++) { | |||||
| s += abs(pix1[0] - avg2(pix2[0], pix3[0])); | s += abs(pix1[0] - avg2(pix2[0], pix3[0])); | ||||
| s += abs(pix1[1] - avg2(pix2[1], pix3[1])); | s += abs(pix1[1] - avg2(pix2[1], pix3[1])); | ||||
| s += abs(pix1[2] - avg2(pix2[2], pix3[2])); | s += abs(pix1[2] - avg2(pix2[2], pix3[2])); | ||||
| @@ -709,13 +714,13 @@ int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h) | |||||
| return s; | return s; | ||||
| } | } | ||||
| int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h) | |||||
| int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size) | |||||
| { | { | ||||
| int s, i; | int s, i; | ||||
| UINT8 *pix3 = pix2 + line_size; | UINT8 *pix3 = pix2 + line_size; | ||||
| s = 0; | s = 0; | ||||
| for(i=0;i<h;i++) { | |||||
| for(i=0;i<16;i++) { | |||||
| s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); | s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); | ||||
| s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); | s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); | ||||
| s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); | s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); | ||||
| @@ -739,6 +744,90 @@ int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h) | |||||
| return s; | return s; | ||||
| } | } | ||||
| int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size) | |||||
| { | |||||
| int s, i; | |||||
| s = 0; | |||||
| for(i=0;i<8;i++) { | |||||
| s += abs(pix1[0] - pix2[0]); | |||||
| s += abs(pix1[1] - pix2[1]); | |||||
| s += abs(pix1[2] - pix2[2]); | |||||
| s += abs(pix1[3] - pix2[3]); | |||||
| s += abs(pix1[4] - pix2[4]); | |||||
| s += abs(pix1[5] - pix2[5]); | |||||
| s += abs(pix1[6] - pix2[6]); | |||||
| s += abs(pix1[7] - pix2[7]); | |||||
| pix1 += line_size; | |||||
| pix2 += line_size; | |||||
| } | |||||
| return s; | |||||
| } | |||||
| int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size) | |||||
| { | |||||
| int s, i; | |||||
| s = 0; | |||||
| for(i=0;i<8;i++) { | |||||
| s += abs(pix1[0] - avg2(pix2[0], pix2[1])); | |||||
| s += abs(pix1[1] - avg2(pix2[1], pix2[2])); | |||||
| s += abs(pix1[2] - avg2(pix2[2], pix2[3])); | |||||
| s += abs(pix1[3] - avg2(pix2[3], pix2[4])); | |||||
| s += abs(pix1[4] - avg2(pix2[4], pix2[5])); | |||||
| s += abs(pix1[5] - avg2(pix2[5], pix2[6])); | |||||
| s += abs(pix1[6] - avg2(pix2[6], pix2[7])); | |||||
| s += abs(pix1[7] - avg2(pix2[7], pix2[8])); | |||||
| pix1 += line_size; | |||||
| pix2 += line_size; | |||||
| } | |||||
| return s; | |||||
| } | |||||
| int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size) | |||||
| { | |||||
| int s, i; | |||||
| UINT8 *pix3 = pix2 + line_size; | |||||
| s = 0; | |||||
| for(i=0;i<8;i++) { | |||||
| s += abs(pix1[0] - avg2(pix2[0], pix3[0])); | |||||
| s += abs(pix1[1] - avg2(pix2[1], pix3[1])); | |||||
| s += abs(pix1[2] - avg2(pix2[2], pix3[2])); | |||||
| s += abs(pix1[3] - avg2(pix2[3], pix3[3])); | |||||
| s += abs(pix1[4] - avg2(pix2[4], pix3[4])); | |||||
| s += abs(pix1[5] - avg2(pix2[5], pix3[5])); | |||||
| s += abs(pix1[6] - avg2(pix2[6], pix3[6])); | |||||
| s += abs(pix1[7] - avg2(pix2[7], pix3[7])); | |||||
| pix1 += line_size; | |||||
| pix2 += line_size; | |||||
| pix3 += line_size; | |||||
| } | |||||
| return s; | |||||
| } | |||||
| int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size) | |||||
| { | |||||
| int s, i; | |||||
| UINT8 *pix3 = pix2 + line_size; | |||||
| s = 0; | |||||
| for(i=0;i<8;i++) { | |||||
| s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); | |||||
| s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); | |||||
| s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); | |||||
| s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4])); | |||||
| s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5])); | |||||
| s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6])); | |||||
| s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7])); | |||||
| s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8])); | |||||
| pix1 += line_size; | |||||
| pix2 += line_size; | |||||
| pix3 += line_size; | |||||
| } | |||||
| return s; | |||||
| } | |||||
| /* permute block according so that it corresponds to the MMX idct | /* permute block according so that it corresponds to the MMX idct | ||||
| order */ | order */ | ||||
| #ifdef SIMPLE_IDCT | #ifdef SIMPLE_IDCT | ||||
| @@ -802,10 +891,14 @@ void dsputil_init(void) | |||||
| add_pixels_clamped = add_pixels_clamped_c; | add_pixels_clamped = add_pixels_clamped_c; | ||||
| gmc1= gmc1_c; | gmc1= gmc1_c; | ||||
| pix_abs16x16 = pix_abs16x16_c; | |||||
| pix_abs16x16_x2 = pix_abs16x16_x2_c; | |||||
| pix_abs16x16_y2 = pix_abs16x16_y2_c; | |||||
| pix_abs16x16 = pix_abs16x16_c; | |||||
| pix_abs16x16_x2 = pix_abs16x16_x2_c; | |||||
| pix_abs16x16_y2 = pix_abs16x16_y2_c; | |||||
| pix_abs16x16_xy2 = pix_abs16x16_xy2_c; | pix_abs16x16_xy2 = pix_abs16x16_xy2_c; | ||||
| pix_abs8x8 = pix_abs8x8_c; | |||||
| pix_abs8x8_x2 = pix_abs8x8_x2_c; | |||||
| pix_abs8x8_y2 = pix_abs8x8_y2_c; | |||||
| pix_abs8x8_xy2 = pix_abs8x8_xy2_c; | |||||
| av_fdct = jpeg_fdct_ifast; | av_fdct = jpeg_fdct_ifast; | ||||
| use_permuted_idct = 1; | use_permuted_idct = 1; | ||||
| @@ -66,17 +66,21 @@ extern void (*sub_pixels_tab[4])(DCTELEM *block, const UINT8 *pixels, int line_s | |||||
| /* motion estimation */ | /* motion estimation */ | ||||
| typedef int (*op_pixels_abs_func)(UINT8 *blk1, UINT8 *blk2, int line_size, int h); | |||||
| typedef int (*op_pixels_abs_func)(UINT8 *blk1, UINT8 *blk2, int line_size); | |||||
| extern op_pixels_abs_func pix_abs16x16; | extern op_pixels_abs_func pix_abs16x16; | ||||
| extern op_pixels_abs_func pix_abs16x16_x2; | extern op_pixels_abs_func pix_abs16x16_x2; | ||||
| extern op_pixels_abs_func pix_abs16x16_y2; | extern op_pixels_abs_func pix_abs16x16_y2; | ||||
| extern op_pixels_abs_func pix_abs16x16_xy2; | extern op_pixels_abs_func pix_abs16x16_xy2; | ||||
| int pix_abs16x16_c(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |||||
| int pix_abs16x16_x2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |||||
| int pix_abs16x16_y2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |||||
| int pix_abs16x16_xy2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |||||
| extern op_pixels_abs_func pix_abs8x8; | |||||
| extern op_pixels_abs_func pix_abs8x8_x2; | |||||
| extern op_pixels_abs_func pix_abs8x8_y2; | |||||
| extern op_pixels_abs_func pix_abs8x8_xy2; | |||||
| int pix_abs16x16_c(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs16x16_x2_c(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs16x16_y2_c(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs16x16_xy2_c(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| static inline int block_permute_op(int j) | static inline int block_permute_op(int j) | ||||
| { | { | ||||
| @@ -469,14 +469,8 @@ void h263_encode_mb(MpegEncContext * s, | |||||
| } | } | ||||
| /* encode each block */ | /* encode each block */ | ||||
| if (s->h263_pred) { | |||||
| for (i = 0; i < 6; i++) { | |||||
| // mpeg4_encode_block(s, block[i], i); | |||||
| } | |||||
| } else { | |||||
| for (i = 0; i < 6; i++) { | |||||
| h263_encode_block(s, block[i], i); | |||||
| } | |||||
| for (i = 0; i < 6; i++) { | |||||
| h263_encode_block(s, block[i], i); | |||||
| } | } | ||||
| } | } | ||||
| @@ -778,8 +772,8 @@ void h263_encode_init(MpegEncContext *s) | |||||
| s->mv_penalty= mv_penalty; //FIXME exact table for msmpeg4 & h263p | s->mv_penalty= mv_penalty; //FIXME exact table for msmpeg4 & h263p | ||||
| // use fcodes >1 only for mpeg4 & h263 & h263p FIXME | // use fcodes >1 only for mpeg4 & h263 & h263p FIXME | ||||
| if(s->h263_plus) s->fcode_tab= umv_fcode_tab; | |||||
| else if(s->h263_pred) s->fcode_tab= fcode_tab; | |||||
| if(s->h263_plus) s->fcode_tab= umv_fcode_tab; | |||||
| else if(s->h263_pred && !s->h263_msmpeg4) s->fcode_tab= fcode_tab; | |||||
| } | } | ||||
| static void h263_encode_block(MpegEncContext * s, DCTELEM * block, int n) | static void h263_encode_block(MpegEncContext * s, DCTELEM * block, int n) | ||||
| @@ -24,19 +24,34 @@ | |||||
| int mm_flags; /* multimedia extension flags */ | int mm_flags; /* multimedia extension flags */ | ||||
| int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |||||
| int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |||||
| int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |||||
| int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |||||
| int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |||||
| int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |||||
| /* external functions, from idct_mmx.c */ | /* external functions, from idct_mmx.c */ | ||||
| void ff_mmx_idct(DCTELEM *block); | void ff_mmx_idct(DCTELEM *block); | ||||
| void ff_mmxext_idct(DCTELEM *block); | void ff_mmxext_idct(DCTELEM *block); | ||||
| /* pixel operations */ | /* pixel operations */ | ||||
| static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001; | |||||
| static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002; | |||||
| static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001LL; | |||||
| static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002LL; | |||||
| //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; | //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; | ||||
| //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; | //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; | ||||
| @@ -1035,10 +1050,14 @@ void dsputil_init_mmx(void) | |||||
| put_pixels_clamped = put_pixels_clamped_mmx; | put_pixels_clamped = put_pixels_clamped_mmx; | ||||
| add_pixels_clamped = add_pixels_clamped_mmx; | add_pixels_clamped = add_pixels_clamped_mmx; | ||||
| pix_abs16x16 = pix_abs16x16_mmx; | |||||
| pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |||||
| pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |||||
| pix_abs16x16 = pix_abs16x16_mmx; | |||||
| pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |||||
| pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |||||
| pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; | pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; | ||||
| pix_abs8x8 = pix_abs8x8_mmx; | |||||
| pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |||||
| pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |||||
| pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |||||
| av_fdct = fdct_mmx; | av_fdct = fdct_mmx; | ||||
| put_pixels_tab[0] = put_pixels_mmx; | put_pixels_tab[0] = put_pixels_mmx; | ||||
| @@ -1067,10 +1086,16 @@ void dsputil_init_mmx(void) | |||||
| sub_pixels_tab[3] = sub_pixels_xy2_mmx; | sub_pixels_tab[3] = sub_pixels_xy2_mmx; | ||||
| if (mm_flags & MM_MMXEXT) { | if (mm_flags & MM_MMXEXT) { | ||||
| pix_abs16x16 = pix_abs16x16_sse; | |||||
| } | |||||
| if (mm_flags & MM_SSE) { | |||||
| pix_abs16x16 = pix_abs16x16_mmx2; | |||||
| pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | |||||
| pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | |||||
| pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2; | |||||
| pix_abs8x8 = pix_abs8x8_mmx2; | |||||
| pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; | |||||
| pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; | |||||
| pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; | |||||
| put_pixels_tab[1] = put_pixels_x2_sse; | put_pixels_tab[1] = put_pixels_x2_sse; | ||||
| put_pixels_tab[2] = put_pixels_y2_sse; | put_pixels_tab[2] = put_pixels_y2_sse; | ||||
| @@ -16,229 +16,347 @@ | |||||
| * along with this program; if not, write to the Free Software | * along with this program; if not, write to the Free Software | ||||
| * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||||
| * | * | ||||
| * mostly by Michael Niedermayer <michaelni@gmx.at> | |||||
| */ | */ | ||||
| #include "../dsputil.h" | #include "../dsputil.h" | ||||
| #include "mmx.h" | |||||
| static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001; | |||||
| static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002; | |||||
| static const __attribute__ ((aligned(8))) UINT64 round_tab[3]={ | |||||
| 0x0000000000000000, | |||||
| 0x0001000100010001, | |||||
| 0x0002000200020002, | |||||
| }; | |||||
| /* mm7 is accumulator, mm6 is zero */ | |||||
| static inline void sad_add(const UINT8 *p1, const UINT8 *p2) | |||||
| static inline void sad8_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h) | |||||
| { | { | ||||
| movq_m2r(*p1, mm0); | |||||
| movq_m2r(*p2, mm1); | |||||
| movq_r2r(mm0, mm2); | |||||
| psubusb_r2r(mm1, mm0); | |||||
| psubusb_r2r(mm2, mm1); | |||||
| por_r2r(mm1, mm0); /* mm0 is absolute value */ | |||||
| movq_r2r(mm0, mm1); | |||||
| punpcklbw_r2r(mm6, mm0); | |||||
| punpckhbw_r2r(mm6, mm1); | |||||
| paddusw_r2r(mm0, mm7); | |||||
| paddusw_r2r(mm1, mm7); | |||||
| int len= -(stride<<h); | |||||
| asm volatile( | |||||
| ".balign 16 \n\t" | |||||
| "1: \n\t" | |||||
| "movq (%1, %%eax), %%mm0 \n\t" | |||||
| "movq (%2, %%eax), %%mm2 \n\t" | |||||
| "movq (%2, %%eax), %%mm4 \n\t" | |||||
| "addl %3, %%eax \n\t" | |||||
| "psubusb %%mm0, %%mm2 \n\t" | |||||
| "psubusb %%mm4, %%mm0 \n\t" | |||||
| "movq (%1, %%eax), %%mm1 \n\t" | |||||
| "movq (%2, %%eax), %%mm3 \n\t" | |||||
| "movq (%2, %%eax), %%mm5 \n\t" | |||||
| "psubusb %%mm1, %%mm3 \n\t" | |||||
| "psubusb %%mm5, %%mm1 \n\t" | |||||
| "por %%mm2, %%mm0 \n\t" | |||||
| "por %%mm1, %%mm3 \n\t" | |||||
| "movq %%mm0, %%mm1 \n\t" | |||||
| "movq %%mm3, %%mm2 \n\t" | |||||
| "punpcklbw %%mm7, %%mm0 \n\t" | |||||
| "punpckhbw %%mm7, %%mm1 \n\t" | |||||
| "punpcklbw %%mm7, %%mm3 \n\t" | |||||
| "punpckhbw %%mm7, %%mm2 \n\t" | |||||
| "paddw %%mm1, %%mm0 \n\t" | |||||
| "paddw %%mm3, %%mm2 \n\t" | |||||
| "paddw %%mm2, %%mm0 \n\t" | |||||
| "paddw %%mm0, %%mm6 \n\t" | |||||
| "addl %3, %%eax \n\t" | |||||
| " js 1b \n\t" | |||||
| : "+a" (len) | |||||
| : "r" (blk1 - len), "r" (blk2 - len), "r" (stride) | |||||
| ); | |||||
| } | } | ||||
| /* convert mm7 to value */ | |||||
| static inline int sad_end(void) | |||||
| static inline void sad8_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h) | |||||
| { | { | ||||
| int res; | |||||
| movq_r2r(mm7, mm0); | |||||
| psrlq_i2r(32, mm7); | |||||
| paddusw_r2r(mm0, mm7); | |||||
| movq_r2r(mm7, mm0); | |||||
| psrlq_i2r(16, mm7); | |||||
| paddusw_r2r(mm0, mm7); | |||||
| __asm __volatile ("movd %%mm7, %0" : "=a" (res)); | |||||
| return res & 0xffff; | |||||
| int len= -(stride<<h); | |||||
| asm volatile( | |||||
| ".balign 16 \n\t" | |||||
| "1: \n\t" | |||||
| "movq (%1, %%eax), %%mm0 \n\t" | |||||
| "movq (%2, %%eax), %%mm2 \n\t" | |||||
| "psadbw %%mm2, %%mm0 \n\t" | |||||
| "addl %3, %%eax \n\t" | |||||
| "movq (%1, %%eax), %%mm1 \n\t" | |||||
| "movq (%2, %%eax), %%mm3 \n\t" | |||||
| "psadbw %%mm1, %%mm3 \n\t" | |||||
| "paddw %%mm3, %%mm0 \n\t" | |||||
| "paddw %%mm0, %%mm6 \n\t" | |||||
| "addl %3, %%eax \n\t" | |||||
| " js 1b \n\t" | |||||
| : "+a" (len) | |||||
| : "r" (blk1 - len), "r" (blk2 - len), "r" (stride) | |||||
| ); | |||||
| } | } | ||||
| int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) | |||||
| static inline void sad8_2_mmx2(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h) | |||||
| { | { | ||||
| const UINT8 *p1, *p2; | |||||
| h >>= 1; | |||||
| p1 = blk1; | |||||
| p2 = blk2; | |||||
| pxor_r2r(mm7, mm7); /* mm7 is accumulator */ | |||||
| pxor_r2r(mm6, mm6); /* mm7 is zero constant */ | |||||
| do { | |||||
| sad_add(p1, p2); | |||||
| sad_add(p1 + 8, p2 + 8); | |||||
| p1 += lx; | |||||
| p2 += lx; | |||||
| sad_add(p1, p2); | |||||
| sad_add(p1 + 8, p2 + 8); | |||||
| p1 += lx; | |||||
| p2 += lx; | |||||
| } while (--h); | |||||
| return sad_end(); | |||||
| int len= -(stride<<h); | |||||
| asm volatile( | |||||
| ".balign 16 \n\t" | |||||
| "1: \n\t" | |||||
| "movq (%1, %%eax), %%mm0 \n\t" | |||||
| "movq (%2, %%eax), %%mm2 \n\t" | |||||
| "pavgb %%mm2, %%mm0 \n\t" | |||||
| "movq (%3, %%eax), %%mm2 \n\t" | |||||
| "psadbw %%mm2, %%mm0 \n\t" | |||||
| "addl %4, %%eax \n\t" | |||||
| "movq (%1, %%eax), %%mm1 \n\t" | |||||
| "movq (%2, %%eax), %%mm3 \n\t" | |||||
| "pavgb %%mm1, %%mm3 \n\t" | |||||
| "movq (%3, %%eax), %%mm1 \n\t" | |||||
| "psadbw %%mm1, %%mm3 \n\t" | |||||
| "paddw %%mm3, %%mm0 \n\t" | |||||
| "paddw %%mm0, %%mm6 \n\t" | |||||
| "addl %4, %%eax \n\t" | |||||
| " js 1b \n\t" | |||||
| : "+a" (len) | |||||
| : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride) | |||||
| ); | |||||
| } | } | ||||
| /* please test it ! */ | |||||
| static inline void sad_add_sse(const UINT8 *p1, const UINT8 *p2) | |||||
| { | |||||
| movq_m2r(*(p1 + 0), mm0); | |||||
| movq_m2r(*(p1 + 8), mm1); | |||||
| psadbw_m2r(*(p2 + 0), mm0); | |||||
| psadbw_m2r(*(p2 + 8), mm1); | |||||
| paddusw_r2r(mm0, mm7); | |||||
| paddusw_r2r(mm1, mm7); | |||||
| static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h) | |||||
| { //FIXME reuse src | |||||
| int len= -(stride<<h); | |||||
| asm volatile( | |||||
| ".balign 16 \n\t" | |||||
| "1: \n\t" | |||||
| "movq (%1, %%eax), %%mm0 \n\t" | |||||
| "movq (%2, %%eax), %%mm2 \n\t" | |||||
| "movq 1(%1, %%eax), %%mm1 \n\t" | |||||
| "movq 1(%2, %%eax), %%mm3 \n\t" | |||||
| "pavgb %%mm2, %%mm0 \n\t" | |||||
| "pavgb %%mm1, %%mm3 \n\t" | |||||
| "pavgb %%mm3, %%mm0 \n\t" | |||||
| "movq (%3, %%eax), %%mm2 \n\t" | |||||
| "psadbw %%mm2, %%mm0 \n\t" | |||||
| "addl %4, %%eax \n\t" | |||||
| "movq (%1, %%eax), %%mm1 \n\t" | |||||
| "movq (%2, %%eax), %%mm3 \n\t" | |||||
| "movq 1(%1, %%eax), %%mm2 \n\t" | |||||
| "movq 1(%2, %%eax), %%mm4 \n\t" | |||||
| "pavgb %%mm3, %%mm1 \n\t" | |||||
| "pavgb %%mm4, %%mm2 \n\t" | |||||
| "pavgb %%mm1, %%mm2 \n\t" | |||||
| "movq (%3, %%eax), %%mm1 \n\t" | |||||
| "psadbw %%mm1, %%mm2 \n\t" | |||||
| "paddw %%mm2, %%mm0 \n\t" | |||||
| "paddw %%mm0, %%mm6 \n\t" | |||||
| "addl %4, %%eax \n\t" | |||||
| " js 1b \n\t" | |||||
| : "+a" (len) | |||||
| : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride) | |||||
| ); | |||||
| } | } | ||||
| int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h) | |||||
| static inline void sad8_2_mmx(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h) | |||||
| { | { | ||||
| const UINT8 *p1, *p2; | |||||
| h >>= 1; | |||||
| p1 = blk1; | |||||
| p2 = blk2; | |||||
| pxor_r2r(mm7, mm7); /* mm7 is accumulator */ | |||||
| do { | |||||
| sad_add_sse(p1, p2); | |||||
| p1 += lx; | |||||
| p2 += lx; | |||||
| sad_add_sse(p1, p2); | |||||
| p1 += lx; | |||||
| p2 += lx; | |||||
| } while (--h); | |||||
| return sad_end(); | |||||
| int len= -(stride<<h); | |||||
| asm volatile( | |||||
| ".balign 16 \n\t" | |||||
| "1: \n\t" | |||||
| "movq (%1, %%eax), %%mm0 \n\t" | |||||
| "movq (%2, %%eax), %%mm1 \n\t" | |||||
| "movq (%1, %%eax), %%mm2 \n\t" | |||||
| "movq (%2, %%eax), %%mm3 \n\t" | |||||
| "punpcklbw %%mm7, %%mm0 \n\t" | |||||
| "punpcklbw %%mm7, %%mm1 \n\t" | |||||
| "punpckhbw %%mm7, %%mm2 \n\t" | |||||
| "punpckhbw %%mm7, %%mm3 \n\t" | |||||
| "paddw %%mm0, %%mm1 \n\t" | |||||
| "paddw %%mm2, %%mm3 \n\t" | |||||
| "movq (%3, %%eax), %%mm4 \n\t" | |||||
| "movq (%3, %%eax), %%mm2 \n\t" | |||||
| "paddw %%mm5, %%mm1 \n\t" | |||||
| "paddw %%mm5, %%mm3 \n\t" | |||||
| "psrlw $1, %%mm1 \n\t" | |||||
| "psrlw $1, %%mm3 \n\t" | |||||
| "packuswb %%mm3, %%mm1 \n\t" | |||||
| "psubusb %%mm1, %%mm4 \n\t" | |||||
| "psubusb %%mm2, %%mm1 \n\t" | |||||
| "por %%mm4, %%mm1 \n\t" | |||||
| "movq %%mm1, %%mm0 \n\t" | |||||
| "punpcklbw %%mm7, %%mm0 \n\t" | |||||
| "punpckhbw %%mm7, %%mm1 \n\t" | |||||
| "paddw %%mm1, %%mm0 \n\t" | |||||
| "paddw %%mm0, %%mm6 \n\t" | |||||
| "addl %4, %%eax \n\t" | |||||
| " js 1b \n\t" | |||||
| : "+a" (len) | |||||
| : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride) | |||||
| ); | |||||
| } | } | ||||
| #define DUMP(reg) { mmx_t tmp; movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq); } | |||||
| /* mm7 is accumulator, mm6 is zero */ | |||||
| static inline void sad_add_x2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3) | |||||
| static inline void sad8_4_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h) | |||||
| { | { | ||||
| movq_m2r(*(p2 + 0), mm0); | |||||
| movq_m2r(*(p3 + 0), mm1); | |||||
| movq_r2r(mm0, mm2); | |||||
| movq_r2r(mm1, mm3); | |||||
| punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */ | |||||
| punpcklbw_r2r(mm6, mm1); | |||||
| punpckhbw_r2r(mm6, mm2); /* high */ | |||||
| punpckhbw_r2r(mm6, mm3); | |||||
| paddusw_r2r(mm1, mm0); | |||||
| paddusw_r2r(mm3, mm2); | |||||
| movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */ | |||||
| paddusw_r2r(mm5, mm0); /* + 1 */ | |||||
| paddusw_r2r(mm5, mm2); /* + 1 */ | |||||
| psrlw_i2r(1, mm0); | |||||
| psrlw_i2r(1, mm2); | |||||
| packuswb_r2r(mm2, mm0); /* average is in mm0 */ | |||||
| movq_r2r(mm1, mm2); | |||||
| psubusb_r2r(mm0, mm1); | |||||
| psubusb_r2r(mm2, mm0); | |||||
| por_r2r(mm1, mm0); /* mm0 is absolute value */ | |||||
| movq_r2r(mm0, mm1); | |||||
| punpcklbw_r2r(mm6, mm0); | |||||
| punpckhbw_r2r(mm6, mm1); | |||||
| paddusw_r2r(mm0, mm7); | |||||
| paddusw_r2r(mm1, mm7); | |||||
| int len= -(stride<<h); | |||||
| asm volatile( | |||||
| ".balign 16 \n\t" | |||||
| "1: \n\t" | |||||
| "movq (%1, %%eax), %%mm0 \n\t" | |||||
| "movq (%2, %%eax), %%mm1 \n\t" | |||||
| "movq %%mm0, %%mm4 \n\t" | |||||
| "movq %%mm1, %%mm2 \n\t" | |||||
| "punpcklbw %%mm7, %%mm0 \n\t" | |||||
| "punpcklbw %%mm7, %%mm1 \n\t" | |||||
| "punpckhbw %%mm7, %%mm4 \n\t" | |||||
| "punpckhbw %%mm7, %%mm2 \n\t" | |||||
| "paddw %%mm1, %%mm0 \n\t" | |||||
| "paddw %%mm2, %%mm4 \n\t" | |||||
| "movq 1(%1, %%eax), %%mm2 \n\t" | |||||
| "movq 1(%2, %%eax), %%mm3 \n\t" | |||||
| "movq %%mm2, %%mm1 \n\t" | |||||
| "punpcklbw %%mm7, %%mm2 \n\t" | |||||
| "punpckhbw %%mm7, %%mm1 \n\t" | |||||
| "paddw %%mm0, %%mm2 \n\t" | |||||
| "paddw %%mm4, %%mm1 \n\t" | |||||
| "movq %%mm3, %%mm4 \n\t" | |||||
| "punpcklbw %%mm7, %%mm3 \n\t" | |||||
| "punpckhbw %%mm7, %%mm4 \n\t" | |||||
| "paddw %%mm3, %%mm2 \n\t" | |||||
| "paddw %%mm4, %%mm1 \n\t" | |||||
| "movq (%3, %%eax), %%mm3 \n\t" | |||||
| "movq (%3, %%eax), %%mm4 \n\t" | |||||
| "paddw %%mm5, %%mm2 \n\t" | |||||
| "paddw %%mm5, %%mm1 \n\t" | |||||
| "psrlw $2, %%mm2 \n\t" | |||||
| "psrlw $2, %%mm1 \n\t" | |||||
| "packuswb %%mm1, %%mm2 \n\t" | |||||
| "psubusb %%mm2, %%mm3 \n\t" | |||||
| "psubusb %%mm4, %%mm2 \n\t" | |||||
| "por %%mm3, %%mm2 \n\t" | |||||
| "movq %%mm2, %%mm0 \n\t" | |||||
| "punpcklbw %%mm7, %%mm0 \n\t" | |||||
| "punpckhbw %%mm7, %%mm2 \n\t" | |||||
| "paddw %%mm2, %%mm0 \n\t" | |||||
| "paddw %%mm0, %%mm6 \n\t" | |||||
| "addl %4, %%eax \n\t" | |||||
| " js 1b \n\t" | |||||
| : "+a" (len) | |||||
| : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" (stride) | |||||
| ); | |||||
| } | } | ||||
| int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) | |||||
| static inline int sum_mmx() | |||||
| { | { | ||||
| const UINT8 *p1, *p2; | |||||
| p1 = blk1; | |||||
| p2 = blk2; | |||||
| pxor_r2r(mm7, mm7); /* mm7 is accumulator */ | |||||
| pxor_r2r(mm6, mm6); /* mm7 is zero constant */ | |||||
| movq_m2r(mm_wone, mm5); /* one constant */ | |||||
| do { | |||||
| sad_add_x2(p1, p2, p2 + 1); | |||||
| sad_add_x2(p1 + 8, p2 + 8, p2 + 9); | |||||
| p1 += lx; | |||||
| p2 += lx; | |||||
| } while (--h); | |||||
| return sad_end(); | |||||
| int ret; | |||||
| asm volatile( | |||||
| "movq %%mm6, %%mm0 \n\t" | |||||
| "psrlq $32, %%mm6 \n\t" | |||||
| "paddw %%mm0, %%mm6 \n\t" | |||||
| "movq %%mm6, %%mm0 \n\t" | |||||
| "psrlq $16, %%mm6 \n\t" | |||||
| "paddw %%mm0, %%mm6 \n\t" | |||||
| "movd %%mm6, %0 \n\t" | |||||
| : "=r" (ret) | |||||
| ); | |||||
| return ret&0xFFFF; | |||||
| } | } | ||||
| int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) | |||||
| static inline int sum_mmx2() | |||||
| { | { | ||||
| const UINT8 *p1, *p2; | |||||
| p1 = blk1; | |||||
| p2 = blk2; | |||||
| pxor_r2r(mm7, mm7); /* mm7 is accumulator */ | |||||
| pxor_r2r(mm6, mm6); /* mm7 is zero constant */ | |||||
| movq_m2r(mm_wone, mm5); /* one constant */ | |||||
| do { | |||||
| sad_add_x2(p1, p2, p2 + lx); | |||||
| sad_add_x2(p1 + 8, p2 + 8, p2 + 8 + lx); | |||||
| p1 += lx; | |||||
| p2 += lx; | |||||
| } while (--h); | |||||
| return sad_end(); | |||||
| int ret; | |||||
| asm volatile( | |||||
| "movd %%mm6, %0 \n\t" | |||||
| : "=r" (ret) | |||||
| ); | |||||
| return ret; | |||||
| } | } | ||||
| /* mm7 is accumulator, mm6 is zero */ | |||||
| static inline void sad_add_xy2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3) | |||||
| { | |||||
| movq_m2r(*(p2 + 0), mm0); | |||||
| movq_m2r(*(p3 + 0), mm1); | |||||
| movq_r2r(mm0, mm2); | |||||
| movq_r2r(mm1, mm3); | |||||
| punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */ | |||||
| punpcklbw_r2r(mm6, mm1); | |||||
| punpckhbw_r2r(mm6, mm2); /* high */ | |||||
| punpckhbw_r2r(mm6, mm3); | |||||
| paddusw_r2r(mm1, mm0); | |||||
| paddusw_r2r(mm3, mm2); | |||||
| movq_m2r(*(p2 + 1), mm1); | |||||
| movq_m2r(*(p3 + 1), mm3); | |||||
| movq_r2r(mm1, mm4); | |||||
| punpcklbw_r2r(mm6, mm1); /* low */ | |||||
| punpckhbw_r2r(mm6, mm4); /* high */ | |||||
| paddusw_r2r(mm1, mm0); | |||||
| paddusw_r2r(mm4, mm2); | |||||
| movq_r2r(mm3, mm4); | |||||
| punpcklbw_r2r(mm6, mm3); /* low */ | |||||
| punpckhbw_r2r(mm6, mm4); /* high */ | |||||
| paddusw_r2r(mm3, mm0); | |||||
| paddusw_r2r(mm4, mm2); | |||||
| movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */ | |||||
| paddusw_r2r(mm5, mm0); /* + 2 */ | |||||
| paddusw_r2r(mm5, mm2); /* + 2 */ | |||||
| psrlw_i2r(2, mm0); | |||||
| psrlw_i2r(2, mm2); | |||||
| packuswb_r2r(mm2, mm0); /* average is in mm0 */ | |||||
/*
 * PIX_SAD(suf) emits the public SAD (sum of absolute differences) entry
 * points for one implementation flavour.  It is built on the helpers
 * sad8_<suf>, sad8_2_<suf>, sad8_4_<suf> and sum_<suf>.
 *
 * Every generated function takes (blk2, blk1, stride):
 *   blk2   - block being matched (the motion search passes the new picture)
 *   blk1   - candidate block in the reference picture
 *   stride - row pitch shared by both blocks
 * and returns the value produced by sum_<suf>().
 *
 * Variants, for both 8x8 and 16x16:
 *   (plain) - blk1 is compared with blk2 directly
 *   _x2     - blk1 averaged with blk1+1      (horizontal half-pel)
 *   _y2     - blk1 averaged with blk1+stride (vertical half-pel)
 *   _xy2    - blk1 averaged over the 2x2 neighbourhood starting at blk1
 *
 * Register protocol shared with the helpers: mm7 and mm6 are cleared here;
 * the helpers use mm7 as the zero operand for unpacking and add their partial
 * sums into mm6, which sum_<suf>() then reduces.  The interpolating variants
 * also load a rounding constant from round_tab into mm5.
 *
 * The last helper argument is log2 of the row count (3 -> 8 rows,
 * 4 -> 16 rows).  16 pixel wide blocks are processed as two 8 pixel columns.
 */
#define PIX_SAD(suf)\
int pix_abs8x8_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t":);\
\
    sad8_ ## suf(blk1, blk2, stride, 3);\
\
    return sum_ ## suf();\
}\
\
int pix_abs8x8_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    /* both interpolation sources must come from the reference block */\
    sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 3);\
\
    return sum_ ## suf();\
}\
\
int pix_abs8x8_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 3);\
\
    return sum_ ## suf();\
}\
\
int pix_abs8x8_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[2]) \
                 );\
\
    sad8_4_ ## suf(blk1, blk2, stride, 3);\
\
    return sum_ ## suf();\
}\
\
int pix_abs16x16_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t":);\
\
    sad8_ ## suf(blk1  , blk2  , stride, 4);\
    sad8_ ## suf(blk1+8, blk2+8, stride, 4);\
\
    return sum_ ## suf();\
}\
\
int pix_abs16x16_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1  , blk1+1, blk2  , stride, 4);\
    sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, 4);\
\
    return sum_ ## suf();\
}\
\
int pix_abs16x16_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1  , blk1+stride,  blk2  , stride, 4);\
    sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, 4);\
\
    return sum_ ## suf();\
}\
\
int pix_abs16x16_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[2]) \
                 );\
\
    sad8_4_ ## suf(blk1  , blk2  , stride, 4);\
    sad8_4_ ## suf(blk1+8, blk2+8, stride, 4);\
\
    return sum_ ## suf();\
}
| movq_r2r(mm1, mm2); | |||||
| psubusb_r2r(mm0, mm1); | |||||
| psubusb_r2r(mm2, mm0); | |||||
| por_r2r(mm1, mm0); /* mm0 is absolute value */ | |||||
| movq_r2r(mm0, mm1); | |||||
| punpcklbw_r2r(mm6, mm0); | |||||
| punpckhbw_r2r(mm6, mm1); | |||||
| paddusw_r2r(mm0, mm7); | |||||
| paddusw_r2r(mm1, mm7); | |||||
| } | |||||
| int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) | |||||
| { | |||||
| const UINT8 *p1, *p2, *p3; | |||||
| p1 = blk1; | |||||
| p2 = blk2; | |||||
| p3 = blk2 + lx; | |||||
| pxor_r2r(mm7, mm7); /* mm7 is accumulator */ | |||||
| pxor_r2r(mm6, mm6); /* mm7 is zero constant */ | |||||
| movq_m2r(mm_wtwo, mm5); /* one constant */ | |||||
| do { | |||||
| sad_add_xy2(p1, p2, p2 + lx); | |||||
| sad_add_xy2(p1 + 8, p2 + 8, p2 + 8 + lx); | |||||
| p1 += lx; | |||||
| p2 += lx; | |||||
| } while (--h); | |||||
| return sad_end(); | |||||
| } | |||||
| PIX_SAD(mmx) | |||||
| PIX_SAD(mmx2) | |||||
| @@ -26,6 +26,7 @@ | |||||
| #include "mpegvideo.h" | #include "mpegvideo.h" | ||||
| #define ABS(a) ((a)>0 ? (a) : -(a)) | #define ABS(a) ((a)>0 ? (a) : -(a)) | ||||
| #define MAX(a,b) ((a) > (b) ? (a) : (b)) | |||||
| #define INTER_BIAS 257 | #define INTER_BIAS 257 | ||||
| static void halfpel_motion_search(MpegEncContext * s, | static void halfpel_motion_search(MpegEncContext * s, | ||||
| @@ -164,7 +165,7 @@ static int full_motion_search(MpegEncContext * s, | |||||
| for (y = y1; y <= y2; y++) { | for (y = y1; y <= y2; y++) { | ||||
| for (x = x1; x <= x2; x++) { | for (x = x1; x <= x2; x++) { | ||||
| d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, | d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, | ||||
| s->linesize, 16); | |||||
| s->linesize); | |||||
| if (d < dmin || | if (d < dmin || | ||||
| (d == dmin && | (d == dmin && | ||||
| (abs(x - xx) + abs(y - yy)) < | (abs(x - xx) + abs(y - yy)) < | ||||
| @@ -228,7 +229,7 @@ static int log_motion_search(MpegEncContext * s, | |||||
| do { | do { | ||||
| for (y = y1; y <= y2; y += range) { | for (y = y1; y <= y2; y += range) { | ||||
| for (x = x1; x <= x2; x += range) { | for (x = x1; x <= x2; x += range) { | ||||
| d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize, 16); | |||||
| d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize); | |||||
| if (d < dmin || (d == dmin && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) { | if (d < dmin || (d == dmin && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) { | ||||
| dmin = d; | dmin = d; | ||||
| mx = x; | mx = x; | ||||
| @@ -308,7 +309,7 @@ static int phods_motion_search(MpegEncContext * s, | |||||
| lastx = x; | lastx = x; | ||||
| for (x = x1; x <= x2; x += range) { | for (x = x1; x <= x2; x += range) { | ||||
| d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize, 16); | |||||
| d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize); | |||||
| if (d < dminx || (d == dminx && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) { | if (d < dminx || (d == dminx && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) { | ||||
| dminx = d; | dminx = d; | ||||
| mx = x; | mx = x; | ||||
| @@ -317,7 +318,7 @@ static int phods_motion_search(MpegEncContext * s, | |||||
| x = lastx; | x = lastx; | ||||
| for (y = y1; y <= y2; y += range) { | for (y = y1; y <= y2; y += range) { | ||||
| d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize, 16); | |||||
| d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize); | |||||
| if (d < dminy || (d == dminy && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) { | if (d < dminy || (d == dminy && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) { | ||||
| dminy = d; | dminy = d; | ||||
| my = y; | my = y; | ||||
| @@ -361,7 +362,7 @@ static int phods_motion_search(MpegEncContext * s, | |||||
| #define CHECK_MV(x,y)\ | #define CHECK_MV(x,y)\ | ||||
| {\ | {\ | ||||
| d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride, 16);\ | |||||
| d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\ | |||||
| d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\ | d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\ | ||||
| if(d<dmin){\ | if(d<dmin){\ | ||||
| best[0]=x;\ | best[0]=x;\ | ||||
| @@ -372,7 +373,7 @@ static int phods_motion_search(MpegEncContext * s, | |||||
| #define CHECK_MV_DIR(x,y,new_dir)\ | #define CHECK_MV_DIR(x,y,new_dir)\ | ||||
| {\ | {\ | ||||
| d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride, 16);\ | |||||
| d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\ | |||||
| d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\ | d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\ | ||||
| if(d<dmin){\ | if(d<dmin){\ | ||||
| best[0]=x;\ | best[0]=x;\ | ||||
| @@ -382,6 +383,30 @@ static int phods_motion_search(MpegEncContext * s, | |||||
| }\ | }\ | ||||
| } | } | ||||
/*
 * Evaluate the full-pel candidate (x,y) for an 8x8 block: SAD against the
 * reference plus the motion vector penalty scaled by quant.  When the cost
 * beats dmin, the candidate is stored in best[] and dmin is updated.
 *
 * Uses d, dmin, best, new_pic, old_pic, pic_stride, mv_penalty, pred_x,
 * pred_y, shift and quant from the enclosing scope.  The expansion is a
 * braced block and is invoked without a trailing semicolon.
 */
#define CHECK_MV4(x,y)\
{\
    d = pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride)\
      + (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
    if(d<dmin){\
        dmin=d;\
        best[0]=x;\
        best[1]=y;\
    }\
}
/*
 * Same cost evaluation as CHECK_MV4, but an improving candidate also records
 * new_dir in next_dir so the caller knows which direction produced it.
 *
 * Uses d, dmin, best, next_dir, new_pic, old_pic, pic_stride, mv_penalty,
 * pred_x, pred_y, shift and quant from the enclosing scope.  The expansion
 * is a braced block and is invoked without a trailing semicolon.
 */
#define CHECK_MV4_DIR(x,y,new_dir)\
{\
    d = pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride)\
      + (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
    if(d<dmin){\
        dmin=d;\
        next_dir= new_dir;\
        best[0]=x;\
        best[1]=y;\
    }\
}
| #define check(x,y,S,v)\ | #define check(x,y,S,v)\ | ||||
| if( (x)<(xmin<<(S)) ) printf("%d %d %d %d xmin" #v, (x), (y), s->mb_x, s->mb_y);\ | if( (x)<(xmin<<(S)) ) printf("%d %d %d %d xmin" #v, (x), (y), s->mb_x, s->mb_y);\ | ||||
| if( (x)>(xmax<<(S)) ) printf("%d %d %d %d xmax" #v, (x), (y), s->mb_x, s->mb_y);\ | if( (x)>(xmax<<(S)) ) printf("%d %d %d %d xmax" #v, (x), (y), s->mb_x, s->mb_y);\ | ||||
| @@ -440,6 +465,32 @@ static inline int small_diamond_search(MpegEncContext * s, int *best, int dmin, | |||||
| */ | */ | ||||
| } | } | ||||
| static inline int small_diamond_search4MV(MpegEncContext * s, int *best, int dmin, | |||||
| UINT8 *new_pic, UINT8 *old_pic, int pic_stride, | |||||
| int pred_x, int pred_y, UINT16 *mv_penalty, int quant, | |||||
| int xmin, int ymin, int xmax, int ymax, int shift) | |||||
| { | |||||
| int next_dir=-1; | |||||
| for(;;){ | |||||
| int d; | |||||
| const int dir= next_dir; | |||||
| const int x= best[0]; | |||||
| const int y= best[1]; | |||||
| next_dir=-1; | |||||
| //printf("%d", dir); | |||||
| if(dir!=2 && x>xmin) CHECK_MV4_DIR(x-1, y , 0) | |||||
| if(dir!=3 && y>ymin) CHECK_MV4_DIR(x , y-1, 1) | |||||
| if(dir!=0 && x<xmax) CHECK_MV4_DIR(x+1, y , 2) | |||||
| if(dir!=1 && y<ymax) CHECK_MV4_DIR(x , y+1, 3) | |||||
| if(next_dir==-1){ | |||||
| return dmin; | |||||
| } | |||||
| } | |||||
| } | |||||
| static inline int snake_search(MpegEncContext * s, int *best, int dmin, | static inline int snake_search(MpegEncContext * s, int *best, int dmin, | ||||
| UINT8 *new_pic, UINT8 *old_pic, int pic_stride, | UINT8 *new_pic, UINT8 *old_pic, int pic_stride, | ||||
| int pred_x, int pred_y, UINT16 *mv_penalty, int quant, | int pred_x, int pred_y, UINT16 *mv_penalty, int quant, | ||||
| @@ -469,7 +520,7 @@ if(256*256*256*64%point==0) | |||||
| x+=x_dir[dir]; | x+=x_dir[dir]; | ||||
| y+=y_dir[dir]; | y+=y_dir[dir]; | ||||
| if(x>=xmin && x<=xmax && y>=ymin && y<=ymax){ | if(x>=xmin && x<=xmax && y>=ymin && y<=ymax){ | ||||
| d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride, 16); | |||||
| d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride); | |||||
| d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant; | d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant; | ||||
| }else{ | }else{ | ||||
| d = dmin + 10000; //FIXME smarter boundary handling | d = dmin + 10000; //FIXME smarter boundary handling | ||||
| @@ -517,7 +568,7 @@ static int epzs_motion_search(MpegEncContext * s, | |||||
| new_pic = s->new_picture[0] + pic_xy; | new_pic = s->new_picture[0] + pic_xy; | ||||
| old_pic = s->last_picture[0] + pic_xy; | old_pic = s->last_picture[0] + pic_xy; | ||||
| dmin = pix_abs16x16(new_pic, old_pic, pic_stride, 16); | |||||
| dmin = pix_abs16x16(new_pic, old_pic, pic_stride); | |||||
| if(dmin<Z_THRESHOLD){ | if(dmin<Z_THRESHOLD){ | ||||
| *mx_ptr= 0; | *mx_ptr= 0; | ||||
| *my_ptr= 0; | *my_ptr= 0; | ||||
| @@ -557,8 +608,56 @@ static int epzs_motion_search(MpegEncContext * s, | |||||
| return dmin; | return dmin; | ||||
| } | } | ||||
| static int epzs_motion_search4(MpegEncContext * s, int block, | |||||
| int *mx_ptr, int *my_ptr, | |||||
| int P[6][2], int pred_x, int pred_y, | |||||
| int xmin, int ymin, int xmax, int ymax) | |||||
| { | |||||
| int best[2]={0, 0}; | |||||
| int d, dmin; | |||||
| UINT8 *new_pic, *old_pic; | |||||
| const int pic_stride= s->linesize; | |||||
| const int pic_xy= ((s->mb_y*2 + (block>>1))*pic_stride + s->mb_x*2 + (block&1))*8; | |||||
| UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame | |||||
| int quant= s->qscale; // qscale of the prev frame | |||||
| const int shift= 1+s->quarter_sample; | |||||
| new_pic = s->new_picture[0] + pic_xy; | |||||
| old_pic = s->last_picture[0] + pic_xy; | |||||
| dmin = pix_abs8x8(new_pic, old_pic, pic_stride); | |||||
| /* first line */ | |||||
| if ((s->mb_y == 0 || s->first_slice_line || s->first_gob_line) && block<2) { | |||||
| CHECK_MV4(P[1][0]>>shift, P[1][1]>>shift) | |||||
| }else{ | |||||
| CHECK_MV4(P[4][0]>>shift, P[4][1]>>shift) | |||||
| if(dmin<Z_THRESHOLD){ | |||||
| *mx_ptr= P[4][0]>>shift; | |||||
| *my_ptr= P[4][1]>>shift; | |||||
| //printf("M\n"); | |||||
| return dmin; | |||||
| } | |||||
| CHECK_MV4(P[1][0]>>shift, P[1][1]>>shift) | |||||
| CHECK_MV4(P[2][0]>>shift, P[2][1]>>shift) | |||||
| CHECK_MV4(P[3][0]>>shift, P[3][1]>>shift) | |||||
| } | |||||
| CHECK_MV4(P[0][0]>>shift, P[0][1]>>shift) | |||||
| CHECK_MV4(P[5][0]>>shift, P[5][1]>>shift) | |||||
| //check(best[0],best[1],0, b0) | |||||
| dmin= small_diamond_search4MV(s, best, dmin, new_pic, old_pic, pic_stride, | |||||
| pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, shift); | |||||
| //check(best[0],best[1],0, b1) | |||||
| *mx_ptr= best[0]; | |||||
| *my_ptr= best[1]; | |||||
| // printf("%d %d %d \n", best[0], best[1], dmin); | |||||
| return dmin; | |||||
| } | |||||
| #define CHECK_HALF_MV(suffix, x, y) \ | #define CHECK_HALF_MV(suffix, x, y) \ | ||||
| d= pix_abs16x16_ ## suffix(pix, ptr+((x)>>1), s->linesize, 16);\ | |||||
| d= pix_abs16x16_ ## suffix(pix, ptr+((x)>>1), s->linesize);\ | |||||
| d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*quant;\ | d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*quant;\ | ||||
| if(d<dminh){\ | if(d<dminh){\ | ||||
| dminh= d;\ | dminh= d;\ | ||||
| @@ -566,6 +665,15 @@ static int epzs_motion_search(MpegEncContext * s, | |||||
| my= my1 + y;\ | my= my1 + y;\ | ||||
| } | } | ||||
/*
 * Evaluate one half-pel candidate for an 8x8 block.  suffix picks the
 * interpolating SAD (x2, y2 or xy2); x and y are the half-pel offsets
 * relative to (mx1,my1).  The horizontal full-pel part of the offset is
 * applied through ptr+((x)>>1); the vertical part is expected to be applied
 * to ptr by the caller.
 *
 * Uses pix, ptr, s, mv_penalty, pen_x, pen_y, quant, d, dminh, mx, my, mx1
 * and my1 from the enclosing scope.  The expansion is a plain statement
 * sequence, not a braced block.
 */
#define CHECK_HALF_MV4(suffix, x, y) \
    d= pix_abs8x8_ ## suffix(pix, ptr+((x)>>1), s->linesize)\
     + (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*quant;\
    if(d<dminh){\
        mx= mx1 + x;\
        my= my1 + y;\
        dminh= d;\
    }
| /* The idea would be to make half pel ME after Inter/Intra decision to | /* The idea would be to make half pel ME after Inter/Intra decision to | ||||
| save time. */ | save time. */ | ||||
| static inline void halfpel_motion_search(MpegEncContext * s, | static inline void halfpel_motion_search(MpegEncContext * s, | ||||
| @@ -614,6 +722,7 @@ static inline void halfpel_motion_search(MpegEncContext * s, | |||||
| CHECK_HALF_MV(xy2, -1, +1) | CHECK_HALF_MV(xy2, -1, +1) | ||||
| CHECK_HALF_MV(y2 , 0, +1) | CHECK_HALF_MV(y2 , 0, +1) | ||||
| CHECK_HALF_MV(xy2, +1, +1) | CHECK_HALF_MV(xy2, +1, +1) | ||||
| }else{ | }else{ | ||||
| mx= 2*(mx - xx); | mx= 2*(mx - xx); | ||||
| my= 2*(my - yy); | my= 2*(my - yy); | ||||
| @@ -623,19 +732,99 @@ static inline void halfpel_motion_search(MpegEncContext * s, | |||||
| *my_ptr = my; | *my_ptr = my; | ||||
| } | } | ||||
| static inline void halfpel_motion_search4(MpegEncContext * s, | |||||
| int *mx_ptr, int *my_ptr, int dmin, | |||||
| int xmin, int ymin, int xmax, int ymax, | |||||
| int pred_x, int pred_y, int block_x, int block_y) | |||||
| { | |||||
| UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame | |||||
| const int quant= s->qscale; | |||||
| int pen_x, pen_y; | |||||
| int mx, my, mx1, my1, d, xx, yy, dminh; | |||||
| UINT8 *pix, *ptr; | |||||
| xx = 8 * block_x; | |||||
| yy = 8 * block_y; | |||||
| pix = s->new_picture[0] + (yy * s->linesize) + xx; | |||||
| mx = *mx_ptr; | |||||
| my = *my_ptr; | |||||
| ptr = s->last_picture[0] + ((yy+my) * s->linesize) + xx + mx; | |||||
| dminh = dmin; | |||||
| if (mx > xmin && mx < xmax && | |||||
| my > ymin && my < ymax) { | |||||
| mx= mx1= 2*mx; | |||||
| my= my1= 2*my; | |||||
| if(dmin < Z_THRESHOLD && mx==0 && my==0){ | |||||
| *mx_ptr = 0; | |||||
| *my_ptr = 0; | |||||
| return; | |||||
| } | |||||
| pen_x= pred_x + mx; | |||||
| pen_y= pred_y + my; | |||||
| ptr-= s->linesize; | |||||
| CHECK_HALF_MV4(xy2, -1, -1) | |||||
| CHECK_HALF_MV4(y2 , 0, -1) | |||||
| CHECK_HALF_MV4(xy2, +1, -1) | |||||
| ptr+= s->linesize; | |||||
| CHECK_HALF_MV4(x2 , -1, 0) | |||||
| CHECK_HALF_MV4(x2 , +1, 0) | |||||
| CHECK_HALF_MV4(xy2, -1, +1) | |||||
| CHECK_HALF_MV4(y2 , 0, +1) | |||||
| CHECK_HALF_MV4(xy2, +1, +1) | |||||
| }else{ | |||||
| mx*=2; | |||||
| my*=2; | |||||
| } | |||||
| *mx_ptr = mx; | |||||
| *my_ptr = my; | |||||
| } | |||||
| static inline void set_mv_tables(MpegEncContext * s, int mx, int my) | |||||
| { | |||||
| const int xy= s->mb_x + s->mb_y*s->mb_width; | |||||
| s->mv_table[0][xy] = mx; | |||||
| s->mv_table[1][xy] = my; | |||||
| /* has allready been set to the 4 MV if 4MV is done */ | |||||
| if(!(s->flags&CODEC_FLAG_4MV)){ | |||||
| int mot_xy= s->block_index[0]; | |||||
| s->motion_val[mot_xy ][0]= mx; | |||||
| s->motion_val[mot_xy ][1]= my; | |||||
| s->motion_val[mot_xy+1][0]= mx; | |||||
| s->motion_val[mot_xy+1][1]= my; | |||||
| mot_xy += s->block_wrap[0]; | |||||
| s->motion_val[mot_xy ][0]= mx; | |||||
| s->motion_val[mot_xy ][1]= my; | |||||
| s->motion_val[mot_xy+1][0]= mx; | |||||
| s->motion_val[mot_xy+1][1]= my; | |||||
| } | |||||
| } | |||||
| #ifndef CONFIG_TEST_MV_ENCODE | #ifndef CONFIG_TEST_MV_ENCODE | ||||
| int estimate_motion(MpegEncContext * s, | |||||
| int mb_x, int mb_y, | |||||
| int *mx_ptr, int *my_ptr) | |||||
| void estimate_motion(MpegEncContext * s, | |||||
| int mb_x, int mb_y) | |||||
| { | { | ||||
| UINT8 *pix, *ppix; | UINT8 *pix, *ppix; | ||||
| int sum, varc, vard, mx, my, range, dmin, xx, yy; | int sum, varc, vard, mx, my, range, dmin, xx, yy; | ||||
| int xmin, ymin, xmax, ymax; | int xmin, ymin, xmax, ymax; | ||||
| int rel_xmin, rel_ymin, rel_xmax, rel_ymax; | int rel_xmin, rel_ymin, rel_xmax, rel_ymax; | ||||
| int pred_x=0, pred_y=0; | int pred_x=0, pred_y=0; | ||||
| int P[5][2]; | |||||
| int P[6][2]; | |||||
| const int shift= 1+s->quarter_sample; | const int shift= 1+s->quarter_sample; | ||||
| int mb_type=0; | |||||
| range = 8 * (1 << (s->f_code - 1)); | range = 8 * (1 << (s->f_code - 1)); | ||||
| /* XXX: temporary kludge to avoid overflow for msmpeg4 */ | /* XXX: temporary kludge to avoid overflow for msmpeg4 */ | ||||
| @@ -680,14 +869,13 @@ int estimate_motion(MpegEncContext * s, | |||||
| case ME_X1: | case ME_X1: | ||||
| case ME_EPZS: | case ME_EPZS: | ||||
| { | { | ||||
| static const int off[4]= {2, 1, 1, -1}; | |||||
| const int mot_stride = s->mb_width*2 + 2; | |||||
| const int mot_xy = (s->mb_y*2 + 1)*mot_stride + s->mb_x*2 + 1; | |||||
| const int mot_stride = s->block_wrap[0]; | |||||
| const int mot_xy = s->block_index[0]; | |||||
| rel_xmin= xmin - s->mb_x*16; | |||||
| rel_xmax= xmax - s->mb_x*16; | |||||
| rel_ymin= ymin - s->mb_y*16; | |||||
| rel_ymax= ymax - s->mb_y*16; | |||||
| rel_xmin= xmin - mb_x*16; | |||||
| rel_xmax= xmax - mb_x*16; | |||||
| rel_ymin= ymin - mb_y*16; | |||||
| rel_ymax= ymax - mb_y*16; | |||||
| P[0][0] = s->motion_val[mot_xy ][0]; | P[0][0] = s->motion_val[mot_xy ][0]; | ||||
| P[0][1] = s->motion_val[mot_xy ][1]; | P[0][1] = s->motion_val[mot_xy ][1]; | ||||
| @@ -696,14 +884,14 @@ int estimate_motion(MpegEncContext * s, | |||||
| if(P[1][0] > (rel_xmax<<shift)) P[1][0]= (rel_xmax<<shift); | if(P[1][0] > (rel_xmax<<shift)) P[1][0]= (rel_xmax<<shift); | ||||
| /* special case for first line */ | /* special case for first line */ | ||||
| if ((s->mb_y == 0 || s->first_slice_line || s->first_gob_line)) { | |||||
| if ((mb_y == 0 || s->first_slice_line || s->first_gob_line)) { | |||||
| P[4][0] = P[1][0]; | P[4][0] = P[1][0]; | ||||
| P[4][1] = P[1][1]; | P[4][1] = P[1][1]; | ||||
| } else { | } else { | ||||
| P[2][0] = s->motion_val[mot_xy - mot_stride ][0]; | P[2][0] = s->motion_val[mot_xy - mot_stride ][0]; | ||||
| P[2][1] = s->motion_val[mot_xy - mot_stride ][1]; | P[2][1] = s->motion_val[mot_xy - mot_stride ][1]; | ||||
| P[3][0] = s->motion_val[mot_xy - mot_stride + off[0] ][0]; | |||||
| P[3][1] = s->motion_val[mot_xy - mot_stride + off[0] ][1]; | |||||
| P[3][0] = s->motion_val[mot_xy - mot_stride + 2 ][0]; | |||||
| P[3][1] = s->motion_val[mot_xy - mot_stride + 2 ][1]; | |||||
| if(P[2][1] > (rel_ymax<<shift)) P[2][1]= (rel_ymax<<shift); | if(P[2][1] > (rel_ymax<<shift)) P[2][1]= (rel_ymax<<shift); | ||||
| if(P[3][0] < (rel_xmin<<shift)) P[3][0]= (rel_xmin<<shift); | if(P[3][0] < (rel_xmin<<shift)) P[3][0]= (rel_xmin<<shift); | ||||
| if(P[3][1] > (rel_ymax<<shift)) P[3][1]= (rel_ymax<<shift); | if(P[3][1] > (rel_ymax<<shift)) P[3][1]= (rel_ymax<<shift); | ||||
| @@ -721,10 +909,72 @@ int estimate_motion(MpegEncContext * s, | |||||
| } | } | ||||
| dmin = epzs_motion_search(s, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax); | dmin = epzs_motion_search(s, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax); | ||||
| mx+= s->mb_x*16; | |||||
| my+= s->mb_y*16; | |||||
| mx+= mb_x*16; | |||||
| my+= mb_y*16; | |||||
| break; | break; | ||||
| } | } | ||||
| if(s->flags&CODEC_FLAG_4MV){ | |||||
| int block; | |||||
| mb_type|= MB_TYPE_INTER4V; | |||||
| for(block=0; block<4; block++){ | |||||
| int mx4, my4; | |||||
| int pred_x4, pred_y4; | |||||
| int dmin4; | |||||
| static const int off[4]= {2, 1, 1, -1}; | |||||
| const int mot_stride = s->block_wrap[0]; | |||||
| const int mot_xy = s->block_index[block]; | |||||
| const int block_x= mb_x*2 + (block&1); | |||||
| const int block_y= mb_y*2 + (block>>1); | |||||
| const int rel_xmin4= xmin - block_x*8; | |||||
| const int rel_xmax4= xmax - block_x*8; | |||||
| const int rel_ymin4= ymin - block_y*8; | |||||
| const int rel_ymax4= ymax - block_y*8; | |||||
| P[0][0] = s->motion_val[mot_xy ][0]; | |||||
| P[0][1] = s->motion_val[mot_xy ][1]; | |||||
| P[1][0] = s->motion_val[mot_xy - 1][0]; | |||||
| P[1][1] = s->motion_val[mot_xy - 1][1]; | |||||
| if(P[1][0] > (rel_xmax4<<shift)) P[1][0]= (rel_xmax4<<shift); | |||||
| /* special case for first line */ | |||||
| if ((mb_y == 0 || s->first_slice_line || s->first_gob_line) && block<2) { | |||||
| P[4][0] = P[1][0]; | |||||
| P[4][1] = P[1][1]; | |||||
| } else { | |||||
| P[2][0] = s->motion_val[mot_xy - mot_stride ][0]; | |||||
| P[2][1] = s->motion_val[mot_xy - mot_stride ][1]; | |||||
| P[3][0] = s->motion_val[mot_xy - mot_stride + off[block]][0]; | |||||
| P[3][1] = s->motion_val[mot_xy - mot_stride + off[block]][1]; | |||||
| if(P[2][1] > (rel_ymax4<<shift)) P[2][1]= (rel_ymax4<<shift); | |||||
| if(P[3][0] < (rel_xmin4<<shift)) P[3][0]= (rel_xmin4<<shift); | |||||
| if(P[3][1] > (rel_ymax4<<shift)) P[3][1]= (rel_ymax4<<shift); | |||||
| P[4][0]= mid_pred(P[1][0], P[2][0], P[3][0]); | |||||
| P[4][1]= mid_pred(P[1][1], P[2][1], P[3][1]); | |||||
| } | |||||
| if(s->out_format == FMT_H263){ | |||||
| pred_x4 = P[4][0]; | |||||
| pred_y4 = P[4][1]; | |||||
| }else { /* mpeg1 at least */ | |||||
| pred_x4= P[1][0]; | |||||
| pred_y4= P[1][1]; | |||||
| } | |||||
| P[5][0]= mx - mb_x*16; | |||||
| P[5][1]= my - mb_y*16; | |||||
| dmin4 = epzs_motion_search4(s, block, &mx4, &my4, P, pred_x4, pred_y4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4); | |||||
| halfpel_motion_search4(s, &mx4, &my4, dmin4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, | |||||
| pred_x4, pred_y4, block_x, block_y); | |||||
| s->motion_val[ s->block_index[block] ][0]= mx4; | |||||
| s->motion_val[ s->block_index[block] ][1]= my4; | |||||
| } | |||||
| } | |||||
| /* intra / predictive decision */ | /* intra / predictive decision */ | ||||
| xx = mb_x * 16; | xx = mb_x * 16; | ||||
| @@ -737,7 +987,7 @@ int estimate_motion(MpegEncContext * s, | |||||
| sum = pix_sum(pix, s->linesize); | sum = pix_sum(pix, s->linesize); | ||||
| #if 0 | #if 0 | ||||
| varc = pix_dev(pix, s->linesize, (sum+128)>>8) + INTER_BIAS; | varc = pix_dev(pix, s->linesize, (sum+128)>>8) + INTER_BIAS; | ||||
| vard = pix_abs16x16(pix, ppix, s->linesize, 16); | |||||
| vard = pix_abs16x16(pix, ppix, s->linesize); | |||||
| #else | #else | ||||
| sum= (sum+8)>>4; | sum= (sum+8)>>4; | ||||
| varc = ((pix_norm1(pix, s->linesize) - sum*sum + 128 + 500)>>8); | varc = ((pix_norm1(pix, s->linesize) - sum*sum + 128 + 500)>>8); | ||||
| @@ -745,30 +995,38 @@ int estimate_motion(MpegEncContext * s, | |||||
| #endif | #endif | ||||
| s->mb_var[s->mb_width * mb_y + mb_x] = varc; | s->mb_var[s->mb_width * mb_y + mb_x] = varc; | ||||
| s->avg_mb_var += varc; | |||||
| s->avg_mb_var+= varc; | |||||
| s->mc_mb_var += vard; | s->mc_mb_var += vard; | ||||
| #if 0 | #if 0 | ||||
| printf("varc=%4d avg_var=%4d (sum=%4d) vard=%4d mx=%2d my=%2d\n", | printf("varc=%4d avg_var=%4d (sum=%4d) vard=%4d mx=%2d my=%2d\n", | ||||
| varc, s->avg_mb_var, sum, vard, mx - xx, my - yy); | varc, s->avg_mb_var, sum, vard, mx - xx, my - yy); | ||||
| #endif | #endif | ||||
| if (vard <= 64 || vard < varc) { | |||||
| if (s->full_search != ME_ZERO) { | |||||
| if(s->flags&CODEC_FLAG_HQ){ | |||||
| if (vard*2 + 200 > varc) | |||||
| mb_type|= MB_TYPE_INTRA; | |||||
| if (varc*2 + 200 > vard){ | |||||
| mb_type|= MB_TYPE_INTER; | |||||
| halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y); | halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y); | ||||
| } else { | |||||
| mx -= 16 * s->mb_x; | |||||
| my -= 16 * s->mb_y; | |||||
| } | } | ||||
| // check(mx + 32*s->mb_x, my + 32*s->mb_y, 1, end) | |||||
| *mx_ptr = mx; | |||||
| *my_ptr = my; | |||||
| return 0; | |||||
| } else { | |||||
| *mx_ptr = 0; | |||||
| *my_ptr = 0; | |||||
| return 1; | |||||
| }else{ | |||||
| if (vard <= 64 || vard < varc) { | |||||
| mb_type|= MB_TYPE_INTER; | |||||
| if (s->full_search != ME_ZERO) { | |||||
| halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y); | |||||
| } else { | |||||
| mx -= 16 * mb_x; | |||||
| my -= 16 * mb_y; | |||||
| } | |||||
| }else{ | |||||
| mb_type|= MB_TYPE_INTRA; | |||||
| mx = 0;//mx*2 - 32 * mb_x; | |||||
| my = 0;//my*2 - 32 * mb_y; | |||||
| } | |||||
| } | } | ||||
| s->mb_type[mb_y*s->mb_width + mb_x]= mb_type; | |||||
| set_mv_tables(s, mx, my); | |||||
| } | } | ||||
| #else | #else | ||||
| @@ -227,6 +227,8 @@ int MPV_common_init(MpegEncContext *s) | |||||
| if (!s->mbskip_table) | if (!s->mbskip_table) | ||||
| goto fail; | goto fail; | ||||
| } | } | ||||
| s->block= s->intra_block; | |||||
| s->context_initialized = 1; | s->context_initialized = 1; | ||||
| return 0; | return 0; | ||||
| @@ -295,7 +297,7 @@ int MPV_encode_init(AVCodecContext *avctx) | |||||
| s->qblur= avctx->qblur; | s->qblur= avctx->qblur; | ||||
| s->avctx = avctx; | s->avctx = avctx; | ||||
| s->aspect_ratio_info= avctx->aspect_ratio_info; | s->aspect_ratio_info= avctx->aspect_ratio_info; | ||||
| s->hq= (avctx->flags & CODEC_FLAG_HQ); | |||||
| s->flags= avctx->flags; | |||||
| if (s->gop_size <= 1) { | if (s->gop_size <= 1) { | ||||
| s->intra_only = 1; | s->intra_only = 1; | ||||
| @@ -1078,68 +1080,183 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]) | |||||
| } | } | ||||
| } | } | ||||
| the_end: | the_end: | ||||
| emms_c(); | |||||
| emms_c(); //FIXME remove | |||||
| } | } | ||||
| static void encode_picture(MpegEncContext *s, int picture_number) | |||||
/**
 * Encode the macroblock at (s->mb_x, s->mb_y) into the bitstream.
 *
 * Steps, in order:
 *  1. load the six 8x8 source blocks from s->new_picture[] into s->block[0..5]
 *     (four from plane 0, one each from planes 1 and 2);
 *  2. if !s->mb_intra, subtract the reference blocks taken from
 *     s->last_picture[] at the position given by s->mv[0][0];
 *  3. select the DC scale (msmpeg4 / h263 helper, or the fixed value 8);
 *  4. run dct_quantize on each block and store the result in
 *     s->block_last_index[];
 *  5. call the encode function matching s->out_format.
 *
 * @param s encoder context; mb_x, mb_y, mb_intra, mv[0][0], qscale and
 *          block must be set by the caller before the call
 */
| static void encode_mb(MpegEncContext *s) | |||||
| { | { | ||||
/* NOTE(review): the next line redeclares names that are declared again just
   below; it looks like a removed-side line of the diff - confirm */
| int mb_x, mb_y, wrap, last_gob, pdif = 0; | |||||
| int wrap; | |||||
| const int mb_x= s->mb_x; | |||||
| const int mb_y= s->mb_y; | |||||
| UINT8 *ptr; | UINT8 *ptr; | ||||
/* NOTE(review): same as above, this line clashes with the three declarations
   that follow it - presumably removed-side diff residue, confirm */
| int i, motion_x, motion_y; | |||||
| const int motion_x= s->mv[0][0][0]; | |||||
| const int motion_y= s->mv[0][0][1]; | |||||
| int i; | |||||
| /* get the pixels */ | |||||
| wrap = s->linesize; | |||||
| ptr = s->new_picture[0] + (mb_y * 16 * wrap) + mb_x * 16; | |||||
| get_pixels(s->block[0], ptr, wrap); | |||||
| get_pixels(s->block[1], ptr + 8, wrap); | |||||
| get_pixels(s->block[2], ptr + 8 * wrap, wrap); | |||||
| get_pixels(s->block[3], ptr + 8 * wrap + 8, wrap); | |||||
/* planes 1 and 2 are addressed with half the line size and 8x8 per macroblock */
| wrap = s->linesize >> 1; | |||||
| ptr = s->new_picture[1] + (mb_y * 8 * wrap) + mb_x * 8; | |||||
| get_pixels(s->block[4], ptr, wrap); | |||||
| wrap = s->linesize >> 1; | |||||
| ptr = s->new_picture[2] + (mb_y * 8 * wrap) + mb_x * 8; | |||||
| get_pixels(s->block[5], ptr, wrap); | |||||
| /* subtract previous frame if non intra */ | |||||
| if (!s->mb_intra) { | |||||
| int dxy, offset, mx, my; | |||||
/* dxy: bit 0 = low bit of motion_x, bit 1 = low bit of motion_y;
   the remaining bits (>> 1) give the whole-pixel offset into plane 0 */
| dxy = ((motion_y & 1) << 1) | (motion_x & 1); | |||||
| ptr = s->last_picture[0] + | |||||
| ((mb_y * 16 + (motion_y >> 1)) * s->linesize) + | |||||
| (mb_x * 16 + (motion_x >> 1)); | |||||
| sub_pixels_2(s->block[0], ptr, s->linesize, dxy); | |||||
| sub_pixels_2(s->block[1], ptr + 8, s->linesize, dxy); | |||||
| sub_pixels_2(s->block[2], ptr + s->linesize * 8, s->linesize, dxy); | |||||
| sub_pixels_2(s->block[3], ptr + 8 + s->linesize * 8, s->linesize ,dxy); | |||||
/* derive the vector used for planes 1 and 2 from the plane-0 vector */
| if (s->out_format == FMT_H263) { | |||||
| /* special rounding for h263 */ | |||||
| dxy = 0; | |||||
| if ((motion_x & 3) != 0) | |||||
| dxy |= 1; | |||||
| if ((motion_y & 3) != 0) | |||||
| dxy |= 2; | |||||
| mx = motion_x >> 2; | |||||
| my = motion_y >> 2; | |||||
| } else { | |||||
/* other formats: halve the vector with C integer division, then split it
   into dxy (low bits) and the whole-pixel offset (>> 1) */
| mx = motion_x / 2; | |||||
| my = motion_y / 2; | |||||
| dxy = ((my & 1) << 1) | (mx & 1); | |||||
| mx >>= 1; | |||||
| my >>= 1; | |||||
| } | |||||
| offset = ((mb_y * 8 + my) * (s->linesize >> 1)) + (mb_x * 8 + mx); | |||||
| ptr = s->last_picture[1] + offset; | |||||
| sub_pixels_2(s->block[4], ptr, s->linesize >> 1, dxy); | |||||
| ptr = s->last_picture[2] + offset; | |||||
| sub_pixels_2(s->block[5], ptr, s->linesize >> 1, dxy); | |||||
| } | |||||
/* disabled debug output of per-macroblock variance statistics */
| #if 0 | |||||
| { | |||||
| float adap_parm; | |||||
| adap_parm = ((s->avg_mb_var << 1) + s->mb_var[s->mb_width*mb_y+mb_x] + 1.0) / | |||||
| ((s->mb_var[s->mb_width*mb_y+mb_x] << 1) + s->avg_mb_var + 1.0); | |||||
| printf("\ntype=%c qscale=%2d adap=%0.2f dquant=%4.2f var=%4d avgvar=%4d", | |||||
| (s->mb_type[s->mb_width*mb_y+mb_x] > 0) ? 'I' : 'P', | |||||
| s->qscale, adap_parm, s->qscale*adap_parm, | |||||
| s->mb_var[s->mb_width*mb_y+mb_x], s->avg_mb_var); | |||||
| } | |||||
| #endif | |||||
| /* DCT & quantize */ | |||||
| if (s->h263_msmpeg4) { | |||||
| msmpeg4_dc_scale(s); | |||||
| } else if (s->h263_pred) { | |||||
| h263_dc_scale(s); | |||||
| } else { | |||||
| /* default quantization values */ | |||||
| s->y_dc_scale = 8; | |||||
| s->c_dc_scale = 8; | |||||
| } | |||||
/* quantize all six blocks; the returned index is kept per block */
| for(i=0;i<6;i++) { | |||||
| s->block_last_index[i] = dct_quantize(s, s->block[i], i, s->qscale); | |||||
| } | |||||
| /* huffman encode */ | |||||
| switch(s->out_format) { | |||||
| case FMT_MPEG1: | |||||
| mpeg1_encode_mb(s, s->block, motion_x, motion_y); | |||||
| break; | |||||
| case FMT_H263: | |||||
| if (s->h263_msmpeg4) | |||||
| msmpeg4_encode_mb(s, s->block, motion_x, motion_y); | |||||
| else if(s->h263_pred) | |||||
| mpeg4_encode_mb(s, s->block, motion_x, motion_y); | |||||
| else | |||||
| h263_encode_mb(s, s->block, motion_x, motion_y); | |||||
| break; | |||||
| case FMT_MJPEG: | |||||
| mjpeg_encode_mb(s, s->block); | |||||
| break; | |||||
| } | |||||
| } | |||||
| static void copy_bits(PutBitContext *pb, UINT8 *src, int length) | |||||
| { | |||||
| int bytes= length>>3; | |||||
| int bits= length&7; | |||||
| int i; | |||||
| for(i=0; i<bytes; i++) put_bits(pb, 8, src[i]); | |||||
| put_bits(pb, bits, src[i]>>(8-bits)); | |||||
| } | |||||
| static void encode_picture(MpegEncContext *s, int picture_number) | |||||
| { | |||||
| int mb_x, mb_y, last_gob, pdif = 0; | |||||
| int i; | |||||
| int bits; | int bits; | ||||
| MpegEncContext best_s; | |||||
| UINT8 bit_buf[4][3000]; //FIXME check that this is ALLWAYS large enogh for a MB | |||||
| s->picture_number = picture_number; | s->picture_number = picture_number; | ||||
| s->block_wrap[0]= | |||||
| s->block_wrap[1]= | |||||
| s->block_wrap[2]= | |||||
| s->block_wrap[3]= s->mb_width*2 + 2; | |||||
| s->block_wrap[4]= | |||||
| s->block_wrap[5]= s->mb_width + 2; | |||||
| s->last_mc_mb_var = s->mc_mb_var; | s->last_mc_mb_var = s->mc_mb_var; | ||||
| /* Reset the average MB variance */ | /* Reset the average MB variance */ | ||||
| s->avg_mb_var = 0; | s->avg_mb_var = 0; | ||||
| s->mc_mb_var = 0; | s->mc_mb_var = 0; | ||||
| /* Estimate motion for every MB */ | /* Estimate motion for every MB */ | ||||
| for(mb_y=0; mb_y < s->mb_height; mb_y++) { | |||||
| for(mb_x=0; mb_x < s->mb_width; mb_x++) { | |||||
| int xy= mb_y * s->mb_width + mb_x; | |||||
| const int mot_stride = s->mb_width*2 + 2; | |||||
| int mot_xy = (mb_y*2 + 1)*mot_stride + mb_x*2 + 1; | |||||
| s->mb_x = mb_x; | |||||
| s->mb_y = mb_y; | |||||
| /* compute motion vector and macro block type (intra or non intra) */ | |||||
| motion_x = 0; | |||||
| motion_y = 0; | |||||
| if (s->pict_type == P_TYPE) { | |||||
| s->mb_intra = estimate_motion(s, mb_x, mb_y, | |||||
| &motion_x, | |||||
| &motion_y); | |||||
| } else { | |||||
| s->mb_intra = 1; | |||||
| if(s->pict_type == P_TYPE){ | |||||
| for(mb_y=0; mb_y < s->mb_height; mb_y++) { | |||||
| s->block_index[0]= s->block_wrap[0]*(mb_y*2 + 1) - 1; | |||||
| s->block_index[1]= s->block_wrap[0]*(mb_y*2 + 1); | |||||
| s->block_index[2]= s->block_wrap[0]*(mb_y*2 + 2) - 1; | |||||
| s->block_index[3]= s->block_wrap[0]*(mb_y*2 + 2); | |||||
| for(mb_x=0; mb_x < s->mb_width; mb_x++) { | |||||
| s->mb_x = mb_x; | |||||
| s->mb_y = mb_y; | |||||
| s->block_index[0]+=2; | |||||
| s->block_index[1]+=2; | |||||
| s->block_index[2]+=2; | |||||
| s->block_index[3]+=2; | |||||
| /* compute motion vector & mb_type and store in context */ | |||||
| estimate_motion(s, mb_x, mb_y); | |||||
| // s->mb_type[mb_y*s->mb_width + mb_x]=MB_TYPE_INTER; | |||||
| } | } | ||||
| /* Store MB type and MV */ | |||||
| s->mb_type[xy] = s->mb_intra; | |||||
| s->mv_table[0][xy] = motion_x; | |||||
| s->mv_table[1][xy] = motion_y; | |||||
| s->motion_val[mot_xy ][0]= motion_x; | |||||
| s->motion_val[mot_xy ][1]= motion_y; | |||||
| s->motion_val[mot_xy+1][0]= motion_x; | |||||
| s->motion_val[mot_xy+1][1]= motion_y; | |||||
| mot_xy += mot_stride; | |||||
| s->motion_val[mot_xy ][0]= motion_x; | |||||
| s->motion_val[mot_xy ][1]= motion_y; | |||||
| s->motion_val[mot_xy+1][0]= motion_x; | |||||
| s->motion_val[mot_xy+1][1]= motion_y; | |||||
| } | } | ||||
| emms_c(); | |||||
| }else{ | |||||
| /* I-Frame */ | |||||
| //FIXME do we need to zero them? | |||||
| memset(s->motion_val[0], 0, sizeof(INT16)*(s->mb_width*2 + 2)*(s->mb_height*2 + 2)*2); | |||||
| memset(s->mv_table[0] , 0, sizeof(INT16)*s->mb_width*s->mb_height); | |||||
| memset(s->mv_table[1] , 0, sizeof(INT16)*s->mb_width*s->mb_height); | |||||
| memset(s->mb_type , MB_TYPE_INTRA, sizeof(UINT8)*s->mb_width*s->mb_height); | |||||
| } | } | ||||
| emms_c(); | |||||
| if(s->avg_mb_var < s->mc_mb_var && s->pict_type != B_TYPE){ //FIXME subtract MV bits | if(s->avg_mb_var < s->mc_mb_var && s->pict_type != B_TYPE){ //FIXME subtract MV bits | ||||
| int i; | |||||
| s->pict_type= I_TYPE; | s->pict_type= I_TYPE; | ||||
| s->picture_in_gop_number=0; | s->picture_in_gop_number=0; | ||||
| for(i=0; i<s->mb_num; i++){ | |||||
| s->mb_type[i] = 1; | |||||
| s->mv_table[0][i] = 0; | |||||
| s->mv_table[1][i] = 0; | |||||
| } | |||||
| memset(s->mb_type , MB_TYPE_INTRA, sizeof(UINT8)*s->mb_width*s->mb_height); | |||||
| //printf("Scene change detected, encoding as I Frame\n"); | |||||
| } | } | ||||
| /* find best f_code for ME which do unlimited searches */ | /* find best f_code for ME which do unlimited searches */ | ||||
| @@ -1152,7 +1269,7 @@ static void encode_picture(MpegEncContext *s, int picture_number) | |||||
| for(i=0; i<8; i++) mv_num[i]=0; | for(i=0; i<8; i++) mv_num[i]=0; | ||||
| for(i=0; i<s->mb_num; i++){ | for(i=0; i<s->mb_num; i++){ | ||||
| if(s->mb_type[i] == 0){ | |||||
| if(s->mb_type[i] & (MB_TYPE_INTER|MB_TYPE_INTER4V)){ | |||||
| mv_num[ fcode_tab[s->mv_table[0][i] + MAX_MV] ]++; | mv_num[ fcode_tab[s->mv_table[0][i] + MAX_MV] ]++; | ||||
| mv_num[ fcode_tab[s->mv_table[1][i] + MAX_MV] ]++; | mv_num[ fcode_tab[s->mv_table[1][i] + MAX_MV] ]++; | ||||
| //printf("%d %d %d\n", s->mv_table[0][i], fcode_tab[s->mv_table[0][i] + MAX_MV], i); | //printf("%d %d %d\n", s->mv_table[0][i], fcode_tab[s->mv_table[0][i] + MAX_MV], i); | ||||
| @@ -1181,16 +1298,20 @@ static void encode_picture(MpegEncContext *s, int picture_number) | |||||
| UINT8 * fcode_tab= s->fcode_tab; | UINT8 * fcode_tab= s->fcode_tab; | ||||
| for(i=0; i<s->mb_num; i++){ | for(i=0; i<s->mb_num; i++){ | ||||
| if(s->mb_type[i] == 0){ | |||||
| if(s->mb_type[i]&MB_TYPE_INTER){ | |||||
| if( fcode_tab[s->mv_table[0][i] + MAX_MV] > f_code | if( fcode_tab[s->mv_table[0][i] + MAX_MV] > f_code | ||||
| || fcode_tab[s->mv_table[0][i] + MAX_MV] == 0 | || fcode_tab[s->mv_table[0][i] + MAX_MV] == 0 | ||||
| || fcode_tab[s->mv_table[1][i] + MAX_MV] > f_code | || fcode_tab[s->mv_table[1][i] + MAX_MV] > f_code | ||||
| || fcode_tab[s->mv_table[1][i] + MAX_MV] == 0 ){ | || fcode_tab[s->mv_table[1][i] + MAX_MV] == 0 ){ | ||||
| s->mb_type[i] = 1; | |||||
| s->mb_type[i] &= ~MB_TYPE_INTER; | |||||
| s->mb_type[i] |= MB_TYPE_INTRA; | |||||
| s->mv_table[0][i] = 0; | s->mv_table[0][i] = 0; | ||||
| s->mv_table[1][i] = 0; | s->mv_table[1][i] = 0; | ||||
| } | } | ||||
| } | } | ||||
| if(s->mb_type[i]&MB_TYPE_INTER4V){ | |||||
| //FIXME | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| @@ -1249,8 +1370,6 @@ static void encode_picture(MpegEncContext *s, int picture_number) | |||||
| s->mb_incr = 1; | s->mb_incr = 1; | ||||
| s->last_mv[0][0][0] = 0; | s->last_mv[0][0][0] = 0; | ||||
| s->last_mv[0][0][1] = 0; | s->last_mv[0][0][1] = 0; | ||||
| s->mv_type = MV_TYPE_16X16; | |||||
| s->mv_dir = MV_DIR_FORWARD; | |||||
| /* Get the GOB height based on picture height */ | /* Get the GOB height based on picture height */ | ||||
| if (s->out_format == FMT_H263 && !s->h263_pred && !s->h263_msmpeg4) { | if (s->out_format == FMT_H263 && !s->h263_pred && !s->h263_msmpeg4) { | ||||
| @@ -1264,12 +1383,6 @@ static void encode_picture(MpegEncContext *s, int picture_number) | |||||
| s->avg_mb_var = s->avg_mb_var / s->mb_num; | s->avg_mb_var = s->avg_mb_var / s->mb_num; | ||||
| s->block_wrap[0]= | |||||
| s->block_wrap[1]= | |||||
| s->block_wrap[2]= | |||||
| s->block_wrap[3]= s->mb_width*2 + 2; | |||||
| s->block_wrap[4]= | |||||
| s->block_wrap[5]= s->mb_width + 2; | |||||
| for(mb_y=0; mb_y < s->mb_height; mb_y++) { | for(mb_y=0; mb_y < s->mb_height; mb_y++) { | ||||
| /* Put GOB header based on RTP MTU */ | /* Put GOB header based on RTP MTU */ | ||||
| /* TODO: Put all this stuff in a separate generic function */ | /* TODO: Put all this stuff in a separate generic function */ | ||||
| @@ -1292,6 +1405,11 @@ static void encode_picture(MpegEncContext *s, int picture_number) | |||||
| s->block_index[4]= s->block_wrap[4]*(mb_y + 1) + s->block_wrap[0]*(s->mb_height*2 + 2); | s->block_index[4]= s->block_wrap[4]*(mb_y + 1) + s->block_wrap[0]*(s->mb_height*2 + 2); | ||||
| s->block_index[5]= s->block_wrap[4]*(mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2); | s->block_index[5]= s->block_wrap[4]*(mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2); | ||||
| for(mb_x=0; mb_x < s->mb_width; mb_x++) { | for(mb_x=0; mb_x < s->mb_width; mb_x++) { | ||||
| const int mb_type= s->mb_type[mb_y * s->mb_width + mb_x]; | |||||
| PutBitContext pb; | |||||
| int d; | |||||
| int dmin=10000000; | |||||
| int best=0; | |||||
| s->mb_x = mb_x; | s->mb_x = mb_x; | ||||
| s->mb_y = mb_y; | s->mb_y = mb_y; | ||||
| @@ -1301,124 +1419,78 @@ static void encode_picture(MpegEncContext *s, int picture_number) | |||||
| s->block_index[3]+=2; | s->block_index[3]+=2; | ||||
| s->block_index[4]++; | s->block_index[4]++; | ||||
| s->block_index[5]++; | s->block_index[5]++; | ||||
| #if 0 | |||||
| /* compute motion vector and macro block type (intra or non intra) */ | |||||
| motion_x = 0; | |||||
| motion_y = 0; | |||||
| if (s->pict_type == P_TYPE) { | |||||
| s->mb_intra = estimate_motion(s, mb_x, mb_y, | |||||
| &motion_x, | |||||
| &motion_y); | |||||
| } else { | |||||
| s->mb_intra = 1; | |||||
| } | |||||
| #endif | |||||
| s->mb_intra = s->mb_type[mb_y * s->mb_width + mb_x]; | |||||
| motion_x = s->mv_table[0][mb_y * s->mb_width + mb_x]; | |||||
| motion_y = s->mv_table[1][mb_y * s->mb_width + mb_x]; | |||||
| /* get the pixels */ | |||||
| wrap = s->linesize; | |||||
| ptr = s->new_picture[0] + (mb_y * 16 * wrap) + mb_x * 16; | |||||
| get_pixels(s->block[0], ptr, wrap); | |||||
| get_pixels(s->block[1], ptr + 8, wrap); | |||||
| get_pixels(s->block[2], ptr + 8 * wrap, wrap); | |||||
| get_pixels(s->block[3], ptr + 8 * wrap + 8, wrap); | |||||
| wrap = s->linesize >> 1; | |||||
| ptr = s->new_picture[1] + (mb_y * 8 * wrap) + mb_x * 8; | |||||
| get_pixels(s->block[4], ptr, wrap); | |||||
| wrap = s->linesize >> 1; | |||||
| ptr = s->new_picture[2] + (mb_y * 8 * wrap) + mb_x * 8; | |||||
| get_pixels(s->block[5], ptr, wrap); | |||||
| /* subtract previous frame if non intra */ | |||||
| if (!s->mb_intra) { | |||||
| int dxy, offset, mx, my; | |||||
| dxy = ((motion_y & 1) << 1) | (motion_x & 1); | |||||
| ptr = s->last_picture[0] + | |||||
| ((mb_y * 16 + (motion_y >> 1)) * s->linesize) + | |||||
| (mb_x * 16 + (motion_x >> 1)); | |||||
| sub_pixels_2(s->block[0], ptr, s->linesize, dxy); | |||||
| sub_pixels_2(s->block[1], ptr + 8, s->linesize, dxy); | |||||
| sub_pixels_2(s->block[2], ptr + s->linesize * 8, s->linesize, dxy); | |||||
| sub_pixels_2(s->block[3], ptr + 8 + s->linesize * 8, s->linesize ,dxy); | |||||
| if (s->out_format == FMT_H263) { | |||||
| /* special rounding for h263 */ | |||||
| dxy = 0; | |||||
| if ((motion_x & 3) != 0) | |||||
| dxy |= 1; | |||||
| if ((motion_y & 3) != 0) | |||||
| dxy |= 2; | |||||
| mx = motion_x >> 2; | |||||
| my = motion_y >> 2; | |||||
| } else { | |||||
| mx = motion_x / 2; | |||||
| my = motion_y / 2; | |||||
| dxy = ((my & 1) << 1) | (mx & 1); | |||||
| mx >>= 1; | |||||
| my >>= 1; | |||||
| s->mv_type = MV_TYPE_16X16; | |||||
| s->mv_dir = MV_DIR_FORWARD; | |||||
| if(mb_type & (mb_type-1)){ // more than 1 MB type possible | |||||
| pb= s->pb; | |||||
| if(mb_type&MB_TYPE_INTER){ | |||||
| s->mb_intra= 0; | |||||
| s->mv[0][0][0] = s->mv_table[0][mb_y * s->mb_width + mb_x]; | |||||
| s->mv[0][0][1] = s->mv_table[1][mb_y * s->mb_width + mb_x]; | |||||
| init_put_bits(&s->pb, bit_buf[1], 3000, NULL, NULL); | |||||
| s->block= s->inter_block; | |||||
| encode_mb(s); | |||||
| d= get_bit_count(&s->pb); | |||||
| if(d<dmin){ | |||||
| flush_put_bits(&s->pb); | |||||
| dmin=d; | |||||
| best_s.mv[0][0][0]= s->mv[0][0][0]; | |||||
| best_s.mv[0][0][1]= s->mv[0][0][1]; | |||||
| best_s.mb_intra= 0; | |||||
| best_s.pb=s->pb; | |||||
| best_s.block= s->block; | |||||
| best=1; | |||||
| for(i=0; i<6; i++) | |||||
| best_s.block_last_index[i]= s->block_last_index[i]; | |||||
| } | |||||
| } | } | ||||
| offset = ((mb_y * 8 + my) * (s->linesize >> 1)) + (mb_x * 8 + mx); | |||||
| ptr = s->last_picture[1] + offset; | |||||
| sub_pixels_2(s->block[4], ptr, s->linesize >> 1, dxy); | |||||
| ptr = s->last_picture[2] + offset; | |||||
| sub_pixels_2(s->block[5], ptr, s->linesize >> 1, dxy); | |||||
| } | |||||
| emms_c(); | |||||
| #if 0 | |||||
| { | |||||
| float adap_parm; | |||||
| adap_parm = ((s->avg_mb_var << 1) + s->mb_var[s->mb_width*mb_y+mb_x] + 1.0) / | |||||
| ((s->mb_var[s->mb_width*mb_y+mb_x] << 1) + s->avg_mb_var + 1.0); | |||||
| printf("\ntype=%c qscale=%2d adap=%0.2f dquant=%4.2f var=%4d avgvar=%4d", | |||||
| (s->mb_type[s->mb_width*mb_y+mb_x] > 0) ? 'I' : 'P', | |||||
| s->qscale, adap_parm, s->qscale*adap_parm, | |||||
| s->mb_var[s->mb_width*mb_y+mb_x], s->avg_mb_var); | |||||
| } | |||||
| #endif | |||||
| /* DCT & quantize */ | |||||
| if (s->h263_msmpeg4) { | |||||
| msmpeg4_dc_scale(s); | |||||
| } else if (s->h263_pred) { | |||||
| h263_dc_scale(s); | |||||
| if(mb_type&MB_TYPE_INTRA){ | |||||
| s->mb_intra= 1; | |||||
| s->mv[0][0][0] = 0; | |||||
| s->mv[0][0][1] = 0; | |||||
| init_put_bits(&s->pb, bit_buf[0], 3000, NULL, NULL); | |||||
| s->block= s->intra_block; | |||||
| encode_mb(s); | |||||
| d= get_bit_count(&s->pb); | |||||
| if(d<dmin){ | |||||
| flush_put_bits(&s->pb); | |||||
| dmin=d; | |||||
| best_s.mv[0][0][0]= 0; | |||||
| best_s.mv[0][0][1]= 0; | |||||
| best_s.mb_intra= 1; | |||||
| best_s.pb=s->pb; | |||||
| best_s.block= s->block; | |||||
| for(i=0; i<6; i++) | |||||
| best_s.block_last_index[i]= s->block_last_index[i]; | |||||
| best=0; | |||||
| } | |||||
| /* force cleaning of ac/dc if needed ... */ | |||||
| s->mbintra_table[mb_x + mb_y*s->mb_width]=1; | |||||
| } | |||||
| s->mv[0][0][0]= best_s.mv[0][0][0]; | |||||
| s->mv[0][0][1]= best_s.mv[0][0][1]; | |||||
| s->mb_intra= best_s.mb_intra; | |||||
| for(i=0; i<6; i++) | |||||
| s->block_last_index[i]= best_s.block_last_index[i]; | |||||
| copy_bits(&pb, bit_buf[best], dmin); | |||||
| s->block= best_s.block; | |||||
| s->pb= pb; | |||||
| } else { | } else { | ||||
| /* default quantization values */ | |||||
| s->y_dc_scale = 8; | |||||
| s->c_dc_scale = 8; | |||||
| } | |||||
| for(i=0;i<6;i++) { | |||||
| s->block_last_index[i] = dct_quantize(s, s->block[i], i, s->qscale); | |||||
| } | |||||
| /* huffman encode */ | |||||
| switch(s->out_format) { | |||||
| case FMT_MPEG1: | |||||
| mpeg1_encode_mb(s, s->block, motion_x, motion_y); | |||||
| break; | |||||
| case FMT_H263: | |||||
| if (s->h263_msmpeg4) | |||||
| msmpeg4_encode_mb(s, s->block, motion_x, motion_y); | |||||
| else if(s->h263_pred) | |||||
| mpeg4_encode_mb(s, s->block, motion_x, motion_y); | |||||
| else | |||||
| h263_encode_mb(s, s->block, motion_x, motion_y); | |||||
| break; | |||||
| case FMT_MJPEG: | |||||
| mjpeg_encode_mb(s, s->block); | |||||
| break; | |||||
| // only one MB-Type possible | |||||
| if(mb_type&MB_TYPE_INTRA){ | |||||
| s->mb_intra= 1; | |||||
| s->mv[0][0][0] = 0; | |||||
| s->mv[0][0][1] = 0; | |||||
| }else{ | |||||
| s->mb_intra= 0; | |||||
| s->mv[0][0][0] = s->mv_table[0][mb_y * s->mb_width + mb_x]; | |||||
| s->mv[0][0][1] = s->mv_table[1][mb_y * s->mb_width + mb_x]; | |||||
| } | |||||
| encode_mb(s); | |||||
| } | } | ||||
| /* decompress blocks so that we keep the state of the decoder */ | |||||
| s->mv[0][0][0] = motion_x; | |||||
| s->mv[0][0][1] = motion_y; | |||||
| MPV_decode_mb(s, s->block); | MPV_decode_mb(s, s->block); | ||||
| } | } | ||||
| @@ -1437,6 +1509,7 @@ static void encode_picture(MpegEncContext *s, int picture_number) | |||||
| s->first_gob_line = 0; | s->first_gob_line = 0; | ||||
| } | } | ||||
| } | } | ||||
| emms_c(); | |||||
| if (s->h263_msmpeg4 && s->pict_type == I_TYPE) | if (s->h263_msmpeg4 && s->pict_type == I_TYPE) | ||||
| msmpeg4_encode_ext_header(s); | msmpeg4_encode_ext_header(s); | ||||
| @@ -1454,7 +1527,6 @@ static void encode_picture(MpegEncContext *s, int picture_number) | |||||
| s->ptr_lastgob = pbBufPtr(&s->pb); | s->ptr_lastgob = pbBufPtr(&s->pb); | ||||
| //fprintf(stderr,"\nGOB: %2d size: %d (last)", s->gob_number, pdif); | //fprintf(stderr,"\nGOB: %2d size: %d (last)", s->gob_number, pdif); | ||||
| } | } | ||||
| } | } | ||||
| static int dct_quantize_c(MpegEncContext *s, | static int dct_quantize_c(MpegEncContext *s, | ||||
| @@ -65,7 +65,7 @@ typedef struct MpegEncContext { | |||||
| int qmax; /* max qscale */ | int qmax; /* max qscale */ | ||||
| int max_qdiff; /* max qscale difference between frames */ | int max_qdiff; /* max qscale difference between frames */ | ||||
| int encoding; /* true if we are encoding (vs decoding) */ | int encoding; /* true if we are encoding (vs decoding) */ | ||||
| int hq; /* set if CODEC_FLAG_HQ is used in AVCodecContext.flags */ | |||||
| int flags; /* AVCodecContext.flags (HQ, MV4, ...) */ | |||||
| /* the following fields are managed internally by the encoder */ | /* the following fields are managed internally by the encoder */ | ||||
| /* bit output */ | /* bit output */ | ||||
| @@ -141,8 +141,16 @@ typedef struct MpegEncContext { | |||||
| int mb_x, mb_y; | int mb_x, mb_y; | ||||
| int mb_incr; | int mb_incr; | ||||
| int mb_intra; | int mb_intra; | ||||
| INT16 *mb_var; /* Table for MB variances */ | |||||
| char *mb_type; /* Table for MB type */ | |||||
| UINT16 *mb_var; /* Table for MB variances */ | |||||
| UINT8 *mb_type; /* Table for MB type */ | |||||
| #define MB_TYPE_INTRA 0x01 | |||||
| #define MB_TYPE_INTER 0x02 | |||||
| #define MB_TYPE_INTER4V 0x04 | |||||
| #define MB_TYPE_SKIPED 0x08 | |||||
| #define MB_TYPE_DIRECT 0x10 | |||||
| #define MB_TYPE_FORWARD 0x20 | |||||
| #define MB_TYPE_BACKWAD 0x40 | |||||
| #define MB_TYPE_BIDIR 0x80 | |||||
| int block_index[6]; | int block_index[6]; | ||||
| int block_wrap[6]; | int block_wrap[6]; | ||||
| @@ -295,7 +303,10 @@ typedef struct MpegEncContext { | |||||
| UINT8 *ptr_last_mb_line; | UINT8 *ptr_last_mb_line; | ||||
| UINT32 mb_line_avgsize; | UINT32 mb_line_avgsize; | ||||
| DCTELEM block[6][64] __align8; | |||||
| DCTELEM (*block)[64]; /* points to one of the following blocks */ | |||||
| DCTELEM intra_block[6][64] __align8; | |||||
| DCTELEM inter_block[6][64] __align8; | |||||
| DCTELEM inter4v_block[6][64] __align8; | |||||
| void (*dct_unquantize)(struct MpegEncContext *s, | void (*dct_unquantize)(struct MpegEncContext *s, | ||||
| DCTELEM *block, int n, int qscale); | DCTELEM *block, int n, int qscale); | ||||
| } MpegEncContext; | } MpegEncContext; | ||||
| @@ -311,9 +322,8 @@ void MPV_common_init_mmx(MpegEncContext *s); | |||||
| /* motion_est.c */ | /* motion_est.c */ | ||||
| int estimate_motion(MpegEncContext *s, | |||||
| int mb_x, int mb_y, | |||||
| int *mx_ptr, int *my_ptr); | |||||
| void estimate_motion(MpegEncContext *s, | |||||
| int mb_x, int mb_y); | |||||
| /* mpeg12.c */ | /* mpeg12.c */ | ||||
| extern INT16 default_intra_matrix[64]; | extern INT16 default_intra_matrix[64]; | ||||