This allows some simplifications and optimizations and should not have any effect on quality. Originally committed as revision 10172 to svn://svn.ffmpeg.org/ffmpeg/trunk (tags/v0.5)
| @@ -111,8 +111,7 @@ void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width){ | |||||
| i = 0; | i = 0; | ||||
| asm volatile( | asm volatile( | ||||
| "pcmpeqd %%xmm7, %%xmm7 \n\t" | |||||
| "psrad $29, %%xmm7 \n\t" | |||||
| "pslld $1, %%xmm7 \n\t" | |||||
| ::); | ::); | ||||
| for(; i<w_l-7; i+=8){ | for(; i<w_l-7; i+=8){ | ||||
| asm volatile( | asm volatile( | ||||
| @@ -157,25 +156,21 @@ void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width){ | |||||
| "movdqu 20(%1), %%xmm6 \n\t" | "movdqu 20(%1), %%xmm6 \n\t" | ||||
| "paddd (%1), %%xmm2 \n\t" | "paddd (%1), %%xmm2 \n\t" | ||||
| "paddd 16(%1), %%xmm6 \n\t" | "paddd 16(%1), %%xmm6 \n\t" | ||||
| "movdqa %%xmm2, %%xmm0 \n\t" | |||||
| "movdqa %%xmm6, %%xmm4 \n\t" | |||||
| "pslld $2, %%xmm2 \n\t" | |||||
| "pslld $2, %%xmm6 \n\t" | |||||
| "psubd %%xmm2, %%xmm0 \n\t" | |||||
| "psubd %%xmm6, %%xmm4 \n\t" | |||||
| "psrad $1, %%xmm0 \n\t" | |||||
| "psrad $1, %%xmm4 \n\t" | |||||
| "movdqu (%0), %%xmm2 \n\t" | |||||
| "movdqu 16(%0), %%xmm6 \n\t" | |||||
| "psubd %%xmm0, %%xmm2 \n\t" | |||||
| "psubd %%xmm4, %%xmm6 \n\t" | |||||
| "movdqu (%0), %%xmm0 \n\t" | |||||
| "movdqu 16(%0), %%xmm4 \n\t" | |||||
| "paddd %%xmm2, %%xmm0 \n\t" | |||||
| "paddd %%xmm6, %%xmm4 \n\t" | |||||
| "psrad $1, %%xmm2 \n\t" | |||||
| "psrad $1, %%xmm6 \n\t" | |||||
| "paddd %%xmm0, %%xmm2 \n\t" | |||||
| "paddd %%xmm4, %%xmm6 \n\t" | |||||
| "movdqa %%xmm2, (%2) \n\t" | "movdqa %%xmm2, (%2) \n\t" | ||||
| "movdqa %%xmm6, 16(%2) \n\t" | "movdqa %%xmm6, 16(%2) \n\t" | ||||
| :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) | :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) | ||||
| : "memory" | : "memory" | ||||
| ); | ); | ||||
| } | } | ||||
| snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS); | |||||
| snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); | |||||
| } | } | ||||
| { | { | ||||
| @@ -291,10 +286,9 @@ void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){ | |||||
| DWTELEM * const ref = b+w2 - 1; | DWTELEM * const ref = b+w2 - 1; | ||||
| i = 1; | i = 1; | ||||
| b[0] = b[0] + (((2 * ref[1] + W_BO-1) + 4 * b[0]) >> W_BS); | |||||
| b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS); | |||||
| asm volatile( | asm volatile( | ||||
| "pcmpeqd %%mm7, %%mm7 \n\t" | |||||
| "psrld $29, %%mm7 \n\t" | |||||
| "pslld $1, %%mm7 \n\t" | |||||
| ::); | ::); | ||||
| for(; i<w_l-3; i+=4){ | for(; i<w_l-3; i+=4){ | ||||
| asm volatile( | asm volatile( | ||||
| @@ -333,16 +327,12 @@ void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){ | |||||
| "movq 12(%1), %%mm6 \n\t" | "movq 12(%1), %%mm6 \n\t" | ||||
| "paddd (%1), %%mm2 \n\t" | "paddd (%1), %%mm2 \n\t" | ||||
| "paddd 8(%1), %%mm6 \n\t" | "paddd 8(%1), %%mm6 \n\t" | ||||
| "pxor %%mm0, %%mm0 \n\t" //note: the 2 xor could be avoided if we would flip the rounding direction | |||||
| "pxor %%mm4, %%mm4 \n\t" | |||||
| "psubd %%mm2, %%mm0 \n\t" | |||||
| "psubd %%mm6, %%mm4 \n\t" | |||||
| "psrad $1, %%mm0 \n\t" | |||||
| "psrad $1, %%mm4 \n\t" | |||||
| "psubd %%mm0, %%mm2 \n\t" | |||||
| "psubd %%mm4, %%mm6 \n\t" | |||||
| "movq (%0), %%mm0 \n\t" | "movq (%0), %%mm0 \n\t" | ||||
| "movq 8(%0), %%mm4 \n\t" | "movq 8(%0), %%mm4 \n\t" | ||||
| "paddd %%mm2, %%mm0 \n\t" | |||||
| "paddd %%mm6, %%mm4 \n\t" | |||||
| "psrad $1, %%mm2 \n\t" | |||||
| "psrad $1, %%mm6 \n\t" | |||||
| "paddd %%mm0, %%mm2 \n\t" | "paddd %%mm0, %%mm2 \n\t" | ||||
| "paddd %%mm4, %%mm6 \n\t" | "paddd %%mm4, %%mm6 \n\t" | ||||
| "movq %%mm2, (%2) \n\t" | "movq %%mm2, (%2) \n\t" | ||||
| @@ -351,7 +341,7 @@ void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){ | |||||
| : "memory" | : "memory" | ||||
| ); | ); | ||||
| } | } | ||||
| snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS); | |||||
| snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); | |||||
| } | } | ||||
| { | { | ||||
| @@ -775,7 +775,7 @@ static av_always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int | |||||
| int i; | int i; | ||||
| assert(shift == 4); | assert(shift == 4); | ||||
| #define LIFTS(src, ref, inv) ((inv) ? (src) + (((ref) + 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23)) | |||||
| #define LIFTS(src, ref, inv) ((inv) ? (src) + (((ref) + 4*(src))>>shift): -((-16*4*(src) + 4*(ref) + add + 5 + (5<<27))/(5*16) - (1<<23))) | |||||
| if(mirror_left){ | if(mirror_left){ | ||||
| dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse); | dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse); | ||||
| dst += dst_step; | dst += dst_step; | ||||
| @@ -1113,8 +1113,8 @@ static void horizontal_decompose97i(DWTELEM *b, int width){ | |||||
| DWTELEM temp[width]; | DWTELEM temp[width]; | ||||
| const int w2= (width+1)>>1; | const int w2= (width+1)>>1; | ||||
| lift (temp+w2, b +1, b , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0); | |||||
| liftS(temp , b , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0); | |||||
| lift (temp+w2, b +1, b , 1, 2, 2, width, W_AM, W_AO, W_AS, 1, 1); | |||||
| liftS(temp , b , temp+w2, 1, 2, 1, width, W_BM, W_BO, W_BS, 0, 0); | |||||
| lift5(b +w2, temp+w2, temp , 1, 1, 1, width, W_CM, W_CO, W_CS, 1, 0); | lift5(b +w2, temp+w2, temp , 1, 1, 1, width, W_CM, W_CO, W_CS, 1, 0); | ||||
| lift (b , temp , b +w2, 1, 1, 1, width, W_DM, W_DO, W_DS, 0, 0); | lift (b , temp , b +w2, 1, 1, 1, width, W_DM, W_DO, W_DS, 0, 0); | ||||
| } | } | ||||
| @@ -1150,7 +1150,7 @@ static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int w | |||||
| #ifdef liftS | #ifdef liftS | ||||
| b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS; | b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS; | ||||
| #else | #else | ||||
| b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23); | |||||
| b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + W_BO*5 + (5<<27)) / (5*16) - (1<<23); | |||||
| #endif | #endif | ||||
| } | } | ||||
| } | } | ||||
| @@ -1344,8 +1344,8 @@ void ff_snow_horizontal_compose97i(DWTELEM *b, int width){ | |||||
| lift (temp , b , b +w2, 1, 1, 1, width, W_DM, W_DO, W_DS, 0, 1); | lift (temp , b , b +w2, 1, 1, 1, width, W_DM, W_DO, W_DS, 0, 1); | ||||
| lift5(temp+w2, b +w2, temp , 1, 1, 1, width, W_CM, W_CO, W_CS, 1, 1); | lift5(temp+w2, b +w2, temp , 1, 1, 1, width, W_CM, W_CO, W_CS, 1, 1); | ||||
| liftS(b , temp , temp+w2, 2, 1, 1, width, W_BM, W_BO-1, W_BS, 0, 1); | |||||
| lift (b+1 , temp+w2, b , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1); | |||||
| liftS(b , temp , temp+w2, 2, 1, 1, width, W_BM, W_BO, W_BS, 0, 1); | |||||
| lift (b+1 , temp+w2, b , 2, 1, 2, width, W_AM, W_AO, W_AS, 1, 0); | |||||
| } | } | ||||
| static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){ | static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){ | ||||
| @@ -165,11 +165,11 @@ static av_always_inline void snow_horizontal_compose_lift_lead_out(int i, DWTELE | |||||
| static av_always_inline void snow_horizontal_compose_liftS_lead_out(int i, DWTELEM * dst, DWTELEM * src, DWTELEM * ref, int width, int w){ | static av_always_inline void snow_horizontal_compose_liftS_lead_out(int i, DWTELEM * dst, DWTELEM * src, DWTELEM * ref, int width, int w){ | ||||
| for(; i<w; i++){ | for(; i<w; i++){ | ||||
| dst[i] = src[i] + ((ref[i] + ref[(i+1)]+W_BO-1 + 4 * src[i]) >> W_BS); | |||||
| dst[i] = src[i] + ((ref[i] + ref[(i+1)]+W_BO + 4 * src[i]) >> W_BS); | |||||
| } | } | ||||
| if(width&1){ | if(width&1){ | ||||
| dst[w] = src[w] + ((2 * ref[w] + W_BO-1 + 4 * src[w]) >> W_BS); | |||||
| dst[w] = src[w] + ((2 * ref[w] + W_BO + 4 * src[w]) >> W_BS); | |||||
| } | } | ||||
| } | } | ||||
| @@ -141,9 +141,9 @@ f8f51fa737add17f7fecaefa118b57ed *./tests/data/a-ffv1.avi | |||||
| 2654678 ./tests/data/a-ffv1.avi | 2654678 ./tests/data/a-ffv1.avi | ||||
| 799d3db687f6cdd7a837ec156efc171f *./tests/data/out.yuv | 799d3db687f6cdd7a837ec156efc171f *./tests/data/out.yuv | ||||
| stddev: 0.00 PSNR:99.99 bytes:7602176 | stddev: 0.00 PSNR:99.99 bytes:7602176 | ||||
| 9078723c943de5d79490f54b99e6ea9e *./tests/data/a-snow.avi | |||||
| 156656 ./tests/data/a-snow.avi | |||||
| f2932084b52e2ede167c9ba21eae0656 *./tests/data/out.yuv | |||||
| 958d649d09b7361d5f00b5b3fcccbcd2 *./tests/data/a-snow.avi | |||||
| 156606 ./tests/data/a-snow.avi | |||||
| b19cb7f9134f922326028c6bb44e96de *./tests/data/out.yuv | |||||
| stddev: 23.14 PSNR:20.83 bytes:7602176 | stddev: 23.14 PSNR:20.83 bytes:7602176 | ||||
| ba999e86070aa971376e7f317a022c37 *./tests/data/a-snow53.avi | ba999e86070aa971376e7f317a022c37 *./tests/data/a-snow53.avi | ||||
| 3519486 ./tests/data/a-snow53.avi | 3519486 ./tests/data/a-snow53.avi | ||||
| @@ -141,9 +141,9 @@ d72b0960e162d4998b9acbabb07e99ab *./tests/data/a-ffv1.avi | |||||
| 3525804 ./tests/data/a-ffv1.avi | 3525804 ./tests/data/a-ffv1.avi | ||||
| dde5895817ad9d219f79a52d0bdfb001 *./tests/data/out.yuv | dde5895817ad9d219f79a52d0bdfb001 *./tests/data/out.yuv | ||||
| stddev: 0.00 PSNR:99.99 bytes:7602176 | stddev: 0.00 PSNR:99.99 bytes:7602176 | ||||
| 40a6e938ac2bd92ee12cd57925e86454 *./tests/data/a-snow.avi | |||||
| 68758 ./tests/data/a-snow.avi | |||||
| 1e356854142898c7c4aab4bfedadf235 *./tests/data/out.yuv | |||||
| 2cfa1bdb443d04a890208a83fd239461 *./tests/data/a-snow.avi | |||||
| 68872 ./tests/data/a-snow.avi | |||||
| 64a0495b7ab53509d3b791465262795c *./tests/data/out.yuv | |||||
| stddev: 10.86 PSNR:27.40 bytes:7602176 | stddev: 10.86 PSNR:27.40 bytes:7602176 | ||||
| 3d0da6aeec9b80c6ee0ff4b747bdd0f0 *./tests/data/a-snow53.avi | 3d0da6aeec9b80c6ee0ff4b747bdd0f0 *./tests/data/a-snow53.avi | ||||
| 2721980 ./tests/data/a-snow53.avi | 2721980 ./tests/data/a-snow53.avi | ||||
| @@ -2046,51 +2046,51 @@ ret: 0 st:-1 ts:-0.645825 flags:1 | |||||
| ret: 0 st: 0 dts:0.040000 pts:0.040000 pos:9610 size:1075 flags:0 | ret: 0 st: 0 dts:0.040000 pts:0.040000 pos:9610 size:1075 flags:0 | ||||
| ---------------- | ---------------- | ||||
| tests/data/a-snow.avi | tests/data/a-snow.avi | ||||
| ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1 | |||||
| ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1 | |||||
| ret: 0 st:-1 ts:-1.000000 flags:0 | ret: 0 st:-1 ts:-1.000000 flags:0 | ||||
| ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1 | |||||
| ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1 | |||||
| ret: 0 st:-1 ts:1.894167 flags:1 | ret: 0 st:-1 ts:1.894167 flags:1 | ||||
| ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46794 size:3663 flags:1 | |||||
| ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46908 size:3663 flags:1 | |||||
| ret: 0 st: 0 ts:0.800000 flags:0 | ret: 0 st: 0 ts:0.800000 flags:0 | ||||
| ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31726 size:3478 flags:1 | |||||
| ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31690 size:3478 flags:1 | |||||
| ret:-1 st: 0 ts:-0.320000 flags:1 | ret:-1 st: 0 ts:-0.320000 flags:1 | ||||
| ret:-1 st:-1 ts:2.576668 flags:0 | ret:-1 st:-1 ts:2.576668 flags:0 | ||||
| ret: 0 st:-1 ts:1.470835 flags:1 | ret: 0 st:-1 ts:1.470835 flags:1 | ||||
| ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46794 size:3663 flags:1 | |||||
| ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46908 size:3663 flags:1 | |||||
| ret: 0 st: 0 ts:0.360000 flags:0 | ret: 0 st: 0 ts:0.360000 flags:0 | ||||
| ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:18006 size:3229 flags:1 | |||||
| ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:17990 size:3229 flags:1 | |||||
| ret:-1 st: 0 ts:-0.760000 flags:1 | ret:-1 st: 0 ts:-0.760000 flags:1 | ||||
| ret:-1 st:-1 ts:2.153336 flags:0 | ret:-1 st:-1 ts:2.153336 flags:0 | ||||
| ret: 0 st:-1 ts:1.047503 flags:1 | ret: 0 st:-1 ts:1.047503 flags:1 | ||||
| ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31726 size:3478 flags:1 | |||||
| ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31690 size:3478 flags:1 | |||||
| ret: 0 st: 0 ts:-0.040000 flags:0 | ret: 0 st: 0 ts:-0.040000 flags:0 | ||||
| ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1 | |||||
| ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1 | |||||
| ret: 0 st: 0 ts:2.840000 flags:1 | ret: 0 st: 0 ts:2.840000 flags:1 | ||||
| ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63240 size:3635 flags:1 | |||||
| ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63350 size:3635 flags:1 | |||||
| ret: 0 st:-1 ts:1.730004 flags:0 | ret: 0 st:-1 ts:1.730004 flags:0 | ||||
| ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63240 size:3635 flags:1 | |||||
| ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63350 size:3635 flags:1 | |||||
| ret: 0 st:-1 ts:0.624171 flags:1 | ret: 0 st:-1 ts:0.624171 flags:1 | ||||
| ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:18006 size:3229 flags:1 | |||||
| ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:17990 size:3229 flags:1 | |||||
| ret: 0 st: 0 ts:-0.480000 flags:0 | ret: 0 st: 0 ts:-0.480000 flags:0 | ||||
| ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1 | |||||
| ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1 | |||||
| ret: 0 st: 0 ts:2.400000 flags:1 | ret: 0 st: 0 ts:2.400000 flags:1 | ||||
| ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63240 size:3635 flags:1 | |||||
| ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63350 size:3635 flags:1 | |||||
| ret: 0 st:-1 ts:1.306672 flags:0 | ret: 0 st:-1 ts:1.306672 flags:0 | ||||
| ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46794 size:3663 flags:1 | |||||
| ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46908 size:3663 flags:1 | |||||
| ret: 0 st:-1 ts:0.200839 flags:1 | ret: 0 st:-1 ts:0.200839 flags:1 | ||||
| ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1 | |||||
| ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1 | |||||
| ret: 0 st: 0 ts:-0.920000 flags:0 | ret: 0 st: 0 ts:-0.920000 flags:0 | ||||
| ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1 | |||||
| ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1 | |||||
| ret: 0 st: 0 ts:2.000000 flags:1 | ret: 0 st: 0 ts:2.000000 flags:1 | ||||
| ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63240 size:3635 flags:1 | |||||
| ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63350 size:3635 flags:1 | |||||
| ret: 0 st:-1 ts:0.883340 flags:0 | ret: 0 st:-1 ts:0.883340 flags:0 | ||||
| ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31726 size:3478 flags:1 | |||||
| ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31690 size:3478 flags:1 | |||||
| ret:-1 st:-1 ts:-0.222493 flags:1 | ret:-1 st:-1 ts:-0.222493 flags:1 | ||||
| ret:-1 st: 0 ts:2.680000 flags:0 | ret:-1 st: 0 ts:2.680000 flags:0 | ||||
| ret: 0 st: 0 ts:1.560000 flags:1 | ret: 0 st: 0 ts:1.560000 flags:1 | ||||
| ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46794 size:3663 flags:1 | |||||
| ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46908 size:3663 flags:1 | |||||
| ret: 0 st:-1 ts:0.460008 flags:0 | ret: 0 st:-1 ts:0.460008 flags:0 | ||||
| ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:18006 size:3229 flags:1 | |||||
| ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:17990 size:3229 flags:1 | |||||
| ret:-1 st:-1 ts:-0.645825 flags:1 | ret:-1 st:-1 ts:-0.645825 flags:1 | ||||
| ---------------- | ---------------- | ||||
| tests/data/a-snow53.avi | tests/data/a-snow53.avi | ||||