You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

3743 lines
108KB

  1. /*
  2. Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
  3. This program is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program; if not, write to the Free Software
  13. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  14. */
  15. /*
  16. C MMX MMX2 3DNow
  17. isVertDC Ec Ec
  18. isVertMinMaxOk Ec Ec
  19. doVertLowPass E e e
  20. doVertDefFilter Ec Ec e e
  21. isHorizDC Ec Ec
  22. isHorizMinMaxOk a E
  23. doHorizLowPass E e e
  24. doHorizDefFilter Ec Ec e e
  25. deRing E e e*
  26. Vertical RKAlgo1 E a a
  27. Horizontal RKAlgo1 a a
  28. Vertical X1# a E E
  29. Horizontal X1# a E E
  30. LinIpolDeinterlace e E E*
  31. CubicIpolDeinterlace a e e*
  32. LinBlendDeinterlace e E E*
  33. MedianDeinterlace# Ec Ec
  34. TempDeNoiser# E e e
  35. * i dont have a 3dnow CPU -> its untested, but noone said it doesnt work so it seems to work
  36. # more or less selfinvented filters so the exactness isnt too meaningfull
  37. E = Exact implementation
  38. e = almost exact implementation (slightly different rounding,...)
  39. a = alternative / approximate impl
  40. c = checked against the other implementations (-vo md5)
  41. */
  42. /*
  43. TODO:
  44. verify that everything works as it should (how?)
  45. reduce the time wasted on the mem transfer
  46. implement everything in C at least (done at the moment but ...)
  47. unroll stuff if instructions depend too much on the prior one
  48. we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
  49. move YScale thing to the end instead of fixing QP
  50. write a faster and higher quality deblocking filter :)
  51. make the mainloop more flexible (variable number of blocks at once
  52. (the if/else stuff per block is slowing things down)
  53. compare the quality & speed of all filters
  54. split this huge file
  55. border remover
  56. optimize c versions
  57. try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
  58. smart blur
  59. commandline option for the deblock thresholds
  60. ...
  61. */
  62. //Changelog: use the CVS log
  63. #include "../config.h"
  64. #include <inttypes.h>
  65. #include <stdio.h>
  66. #include <stdlib.h>
  67. #include <string.h>
  68. #ifdef HAVE_MALLOC_H
  69. #include <malloc.h>
  70. #endif
  71. //#undef HAVE_MMX2
  72. //#define HAVE_3DNOW
  73. //#undef HAVE_MMX
  74. //#define DEBUG_BRIGHTNESS
  75. #include "postprocess.h"
/* Generic helper macros.
   NOTE: classic multiple-evaluation hazard — arguments must be free of
   side effects. Also note SIGN(0) == -1, which callers rely on. */
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define SIGN(a) ((a) > 0 ? 1 : -1)

/* PAVGB(a,b): emit one byte-wise-average instruction (b = avg(a,b)) into an
   inline-asm string. MMX2 pavgb and 3DNow pavgusb round slightly
   differently; the filters below tolerate that (see the e/E table at the
   top of the file). Only defined when one of the two ISAs is available. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* PMINUB: byte-wise unsigned minimum. The plain-MMX fallback emulates
   pminub via a saturated subtract (t = a -sat b; a -= t) and therefore
   needs the extra scratch register t. Note the argument order differs
   between the two variants — callers pass (src, dst, tmp) accordingly. */
#ifdef HAVE_MMX2
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMINUB(b,a,t) \
"movq " #a ", " #t " \n\t"\
"psubusb " #b ", " #t " \n\t"\
"psubb " #t ", " #a " \n\t"
#endif

/* PMAXUB: byte-wise unsigned maximum; the MMX fallback uses
   b = (b -sat a) + a == max(a,b). */
#ifdef HAVE_MMX2
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMAXUB(a,b) \
"psubusb " #a ", " #b " \n\t"\
"paddb " #a ", " #b " \n\t"
#endif
#define GET_MODE_BUFFER_SIZE 500
#define OPTIONS_ARRAY_SIZE 10

#ifdef HAVE_MMX
/* 64-bit constants referenced *by name* from the inline-asm strings below;
   the compiler cannot see those references, so keep names and definitions
   in sync with the asm. All must be 8-byte aligned for movq.
   packedYOffset / packedYScale are volatile because the level-fix
   (auto-brightness) code rewrites them at runtime. */
static volatile uint64_t __attribute__((aligned(8))) packedYOffset= 0x0000000000000000LL;
static volatile uint64_t __attribute__((aligned(8))) packedYScale= 0x0100010001000100LL;
/* wNN: the 16-bit word 0xNN replicated into all four word lanes */
static uint64_t __attribute__((aligned(8))) w05= 0x0005000500050005LL;
static uint64_t __attribute__((aligned(8))) w20= 0x0020002000200020LL;
static uint64_t __attribute__((aligned(8))) w1400= 0x1400140014001400LL;
/* bmXXXXXXXX: byte masks — 0xFF in each byte whose digit is 1 in the name
   (rightmost digit = least significant byte) */
static uint64_t __attribute__((aligned(8))) bm00000001= 0x00000000000000FFLL;
static uint64_t __attribute__((aligned(8))) bm00010000= 0x000000FF00000000LL;
static uint64_t __attribute__((aligned(8))) bm00001000= 0x00000000FF000000LL;
static uint64_t __attribute__((aligned(8))) bm10000000= 0xFF00000000000000LL;
static uint64_t __attribute__((aligned(8))) bm10000001= 0xFF000000000000FFLL;
static uint64_t __attribute__((aligned(8))) bm11000011= 0xFFFF00000000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000011= 0x000000000000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111110= 0xFFFFFFFFFFFFFF00LL;
static uint64_t __attribute__((aligned(8))) bm11000000= 0xFFFF000000000000LL;
static uint64_t __attribute__((aligned(8))) bm00011000= 0x000000FFFF000000LL;
static uint64_t __attribute__((aligned(8))) bm00110011= 0x0000FFFF0000FFFFLL;
static uint64_t __attribute__((aligned(8))) bm11001100= 0xFFFF0000FFFF0000LL;
/* bNN: the byte 0xNN replicated into all eight byte lanes */
static uint64_t __attribute__((aligned(8))) b00= 0x0000000000000000LL;
static uint64_t __attribute__((aligned(8))) b01= 0x0101010101010101LL;
static uint64_t __attribute__((aligned(8))) b02= 0x0202020202020202LL;
static uint64_t __attribute__((aligned(8))) b0F= 0x0F0F0F0F0F0F0F0FLL;
static uint64_t __attribute__((aligned(8))) b04= 0x0404040404040404LL;
static uint64_t __attribute__((aligned(8))) b08= 0x0808080808080808LL;
static uint64_t __attribute__((aligned(8))) bFF= 0xFFFFFFFFFFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) b20= 0x2020202020202020LL;
static uint64_t __attribute__((aligned(8))) b80= 0x8080808080808080LL;
static uint64_t __attribute__((aligned(8))) b7E= 0x7E7E7E7E7E7E7E7ELL;
static uint64_t __attribute__((aligned(8))) b7C= 0x7C7C7C7C7C7C7C7CLL;
static uint64_t __attribute__((aligned(8))) b3F= 0x3F3F3F3F3F3F3F3FLL;
/* asm scratch memory (used as spill space by some filters) */
static uint64_t __attribute__((aligned(8))) temp0=0;
static uint64_t __attribute__((aligned(8))) temp1=0;
static uint64_t __attribute__((aligned(8))) temp2=0;
static uint64_t __attribute__((aligned(8))) temp3=0;
static uint64_t __attribute__((aligned(8))) temp4=0;
static uint64_t __attribute__((aligned(8))) temp5=0;
/* pQPb: current QP replicated into all 8 bytes; pQPb2 presumably holds a
   multiple of QP set alongside it — TODO confirm where it is written. */
static uint64_t __attribute__((aligned(8))) pQPb=0;
static uint64_t __attribute__((aligned(8))) pQPb2=0;
static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code
static uint32_t __attribute__((aligned(4))) maxTmpNoise[4];
#else
/* C-only build: keep the few globals the portable code paths touch. */
static uint64_t packedYOffset= 0x0000000000000000LL;
static uint64_t packedYScale= 0x0100010001000100LL;
static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
#endif

/* Deblock decision thresholds: a block counts as "flat" (DC) when more than
   this many neighbor pairs are nearly equal (see isHorizDC/isVertDC). */
int hFlatnessThreshold= 56 - 16;
int vFlatnessThreshold= 56 - 16;
//amount of "black" u r willing to loose to get a brightness corrected picture
double maxClippedThreshold= 0.01;
/* luma range used by the level-fix / auto-brightness code */
int maxAllowedY=234;
int minAllowedY=16;
/* Table of all selectable postprocessing filters, terminated by a NULL
   entry. Fields follow struct PPFilter (declared in postprocess.h, not
   visible here); they appear to be {short option name, long option name,
   flag, min quality, max quality, filter mask bit} — NOTE(review): confirm
   the exact field meaning against the header. */
static struct PPFilter filters[]=
{
{"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
{"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
{"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
{"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
{"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
{"dr", "dering", 1, 5, 6, DERING},
{"al", "autolevels", 0, 1, 2, LEVEL_FIX},
{"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
{"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
{"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
{"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
{"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
{NULL, NULL,0,0,0,0} //End Marker
};
/* Alias table for the mode string parser: pairs of (shorthand name,
   full filter list it expands to), scanned pairwise, NULL-terminated. */
static char *replaceTable[]=
{
"default", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
"de", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
"fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
"fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
NULL //End Marker
};
#ifdef HAVE_MMX
/* Suppress gcc "unused variable" warnings for the static 64-bit constants
   above: they are only referenced by name inside inline-asm strings, which
   the compiler cannot see. Summing them here creates a visible use; the
   condition is effectively never true for real data, and the store to b00
   merely keeps the expression from being optimized away entirely. */
static inline void unusedVariableWarningFixer()
{
	if(
	packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
	+ bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
	+ bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
	+ bFF + b20 + b04+ b08 + pQPb2 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
	+ temp5 + pQPb== 0) b00=0;
}
#endif
#ifdef TIMING
/* Read the CPU time-stamp counter (used for benchmarking the filters).
   The "=A" constraint places the 64-bit result in edx:eax (x86-32 only). */
static inline long long rdtsc()
{
	long long l;
	asm volatile( "rdtsc\n\t"
		: "=A" (l)
	);
//	printf("%d\n", int(l/1000));
	return l;
}
#endif
#ifdef HAVE_MMX2
/* Thin wrappers around the SSE prefetch hint instructions. p is only an
   address hint; nothing is read architecturally and no fault is raised
   for bad addresses. nta = non-temporal (bypass cache pollution),
   t0/t1/t2 = progressively outer cache levels. */
static inline void prefetchnta(void *p)
{
	asm volatile(	"prefetchnta (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht0(void *p)
{
	asm volatile(	"prefetcht0 (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht1(void *p)
{
	asm volatile(	"prefetcht1 (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht2(void *p)
{
	asm volatile(	"prefetcht2 (%0)\n\t"
		: : "r" (p)
	);
}
#endif
//FIXME? |255-0| = 1 (shouldnt be a problem ...)
/**
 * Check if the middle 8x8 Block in the given 8x16 block is flat.
 * "Flat" means: across the 7 vertical neighbor pairs of the middle 8 rows,
 * the number of byte positions whose difference is within +/-1 exceeds
 * vFlatnessThreshold. Returns 1 if flat, 0 otherwise.
 * The MMX path accumulates a per-column count of -1 (pcmpgtb result) per
 * matching pair, then horizontally sums the bytes; (256 - sum) & 0xFF
 * recovers the positive count (max 7*8 = 56, so one byte suffices).
 */
static inline int isVertDC(uint8_t src[], int stride){
	int numEq= 0;
#ifndef HAVE_MMX
	int y;
#endif
	src+= stride*4; // src points to begin of the 8x8 Block
#ifdef HAVE_MMX
	asm volatile(
		"leal (%1, %2), %%eax				\n\t"
		"leal (%%eax, %2, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%1	eax	eax+%2	eax+2%2	%1+4%2	ebx	ebx+%2	ebx+2%2	%1+8%2	ebx+4%2
		"movq b7E, %%mm7				\n\t" // mm7 = 0x7E (diff+0x7E > 0x7C signed <=> diff in {-1,0,1})
		"movq b7C, %%mm6				\n\t" // mm6 = 0x7C
		"movq (%1), %%mm0				\n\t"
		"movq (%%eax), %%mm1				\n\t"
		"psubb %%mm1, %%mm0				\n\t" // mm0 = differnece
		"paddb %%mm7, %%mm0				\n\t"
		"pcmpgtb %%mm6, %%mm0				\n\t"

		"movq (%%eax,%2), %%mm2				\n\t"
		"psubb %%mm2, %%mm1				\n\t"
		"paddb %%mm7, %%mm1				\n\t"
		"pcmpgtb %%mm6, %%mm1				\n\t"
		"paddb %%mm1, %%mm0				\n\t"

		"movq (%%eax, %2, 2), %%mm1			\n\t"
		"psubb %%mm1, %%mm2				\n\t"
		"paddb %%mm7, %%mm2				\n\t"
		"pcmpgtb %%mm6, %%mm2				\n\t"
		"paddb %%mm2, %%mm0				\n\t"

		"movq (%1, %2, 4), %%mm2			\n\t"
		"psubb %%mm2, %%mm1				\n\t"
		"paddb %%mm7, %%mm1				\n\t"
		"pcmpgtb %%mm6, %%mm1				\n\t"
		"paddb %%mm1, %%mm0				\n\t"

		"movq (%%ebx), %%mm1				\n\t"
		"psubb %%mm1, %%mm2				\n\t"
		"paddb %%mm7, %%mm2				\n\t"
		"pcmpgtb %%mm6, %%mm2				\n\t"
		"paddb %%mm2, %%mm0				\n\t"

		"movq (%%ebx, %2), %%mm2			\n\t"
		"psubb %%mm2, %%mm1				\n\t"
		"paddb %%mm7, %%mm1				\n\t"
		"pcmpgtb %%mm6, %%mm1				\n\t"
		"paddb %%mm1, %%mm0				\n\t"

		"movq (%%ebx, %2, 2), %%mm1			\n\t"
		"psubb %%mm1, %%mm2				\n\t"
		"paddb %%mm7, %%mm2				\n\t"
		"pcmpgtb %%mm6, %%mm2				\n\t"
		"paddb %%mm2, %%mm0				\n\t"

		"						\n\t"
		// horizontal byte sum of the 8 per-column counters
		"movq %%mm0, %%mm1				\n\t"
		"psrlw $8, %%mm0				\n\t"
		"paddb %%mm1, %%mm0				\n\t"
#ifdef HAVE_MMX2
		"pshufw $0xF9, %%mm0, %%mm1			\n\t"
		"paddb %%mm1, %%mm0				\n\t"
		"pshufw $0xFE, %%mm0, %%mm1			\n\t"
#else
		"movq %%mm0, %%mm1				\n\t"
		"psrlq $16, %%mm0				\n\t"
		"paddb %%mm1, %%mm0				\n\t"
		"movq %%mm0, %%mm1				\n\t"
		"psrlq $32, %%mm0				\n\t"
#endif
		"paddb %%mm1, %%mm0				\n\t"
		"movd %%mm0, %0					\n\t"
		: "=r" (numEq)
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
		);
	// low byte holds -count (mod 256); recover the positive count
	numEq= (256 - numEq) &0xFF;
#else
	// C reference: (diff+1) & 0xFFFF < 3 counts diff in {-1,0,1}
	// (the mask folds negative values to large positives)
	for(y=0; y<BLOCK_SIZE-1; y++)
	{
		if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
		src+= stride;
	}
#endif
/*	if(abs(numEq - asmEq) > 0)
	{
		printf("\nasm:%d  c:%d\n", asmEq, numEq);
		for(int y=0; y<8; y++)
		{
			for(int x=0; x<8; x++)
			{
				printf("%d ", temp[x + y*stride]);
			}
			printf("\n");
		}
	}
*/
//	for(int i=0; i<numEq/8; i++) src[i]=255;
	return (numEq > vFlatnessThreshold) ? 1 : 0;
}
/**
 * Check that the block boundary is not a real image edge: returns nonzero
 * when the difference between the first and last line of the 8-line span
 * is small enough (<= 2*QP per byte, per the C reference) that smoothing
 * across it is justified.
 * NOTE(review): the MMX path reduces the bytewise test with dword compares
 * (pcmpeqd) after a 16-bit shift, so it is not bit-identical to the C
 * version for all inputs — see the commented-out comparison code below;
 * it is listed only as "a" (approximate) in the table at the top of the file.
 */
static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
{
#ifdef HAVE_MMX
	int isOk;
	src+= stride*3;
	asm volatile(
//		"int $3 \n\t"
		"movq (%1, %2), %%mm0				\n\t" // first line of the span
		"movq (%1, %2, 8), %%mm1			\n\t" // last line of the span
		"movq %%mm0, %%mm2				\n\t"
		"psubusb %%mm1, %%mm0				\n\t"
		"psubusb %%mm2, %%mm1				\n\t"
		"por %%mm1, %%mm0				\n\t" // ABS Diff
		"movq pQPb, %%mm7				\n\t" // QP,..., QP
		"paddusb %%mm7, %%mm7				\n\t" // 2QP ... 2QP
		"psubusb %%mm7, %%mm0				\n\t" // Diff <= 2QP -> 0
		"pcmpeqd b00, %%mm0				\n\t"
		"psrlq $16, %%mm0				\n\t"
		"pcmpeqd bFF, %%mm0				\n\t"
//		"movd %%mm0, (%1, %2, 4)\n\t"
		"movd %%mm0, %0					\n\t"
		: "=r" (isOk)
		: "r" (src), "r" (stride)
		);
	return isOk;
#else
	// C reference: every column must satisfy |line1 - line8| <= 2*QP
	int isOk2= 1;
	int x;
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
	}
/*	if(isOk && !isOk2 || !isOk && isOk2)
	{
		printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
		for(int y=0; y<9; y++)
		{
			for(int x=0; x<8; x++)
			{
				printf("%d ", src[x + y*stride]);
			}
			printf("\n");
		}
	} */
	return isOk2;
#endif
}
/**
 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16.
 * The first/last tap is replaced by its inner neighbor when the step to it
 * exceeds QP, so the filter does not blur across a real edge at the block
 * border. The MMX2/3DNow path composes the taps from chained byte averages
 * (PAVGB), so its rounding differs slightly from the exact C reference
 * below ("e" in the accuracy table at the top of the file).
 */
static inline void doVertLowPass(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
	asm volatile(	//"movv %0 %1 %2\n\t"
		"movq pQPb, %%mm0			\n\t"  // QP,..., QP

		// pick the "first line to filter": line 0 if |l0-l1| <= QP, else l1
		"movq (%0), %%mm6			\n\t"
		"movq (%0, %1), %%mm5			\n\t"
		"movq %%mm5, %%mm1			\n\t"
		"movq %%mm6, %%mm2			\n\t"
		"psubusb %%mm6, %%mm5			\n\t"
		"psubusb %%mm1, %%mm2			\n\t"
		"por %%mm5, %%mm2			\n\t" // ABS Diff of lines
		"psubusb %%mm0, %%mm2			\n\t" // diff <= QP -> 0
		"pcmpeqb b00, %%mm2			\n\t" // diff <= QP -> FF

		"pand %%mm2, %%mm6			\n\t"
		"pandn %%mm1, %%mm2			\n\t"
		"por %%mm2, %%mm6			\n\t"// First Line to Filter

		// same selection for the "last line to filter" (l8 vs l9)
		"movq (%0, %1, 8), %%mm5		\n\t"
		"leal (%0, %1, 4), %%eax		\n\t"
		"leal (%0, %1, 8), %%ebx		\n\t"
		"subl %1, %%ebx				\n\t"
		"addl %1, %0				\n\t" // %0 points to line 1 not 0
		"movq (%0, %1, 8), %%mm7		\n\t"
		"movq %%mm5, %%mm1			\n\t"
		"movq %%mm7, %%mm2			\n\t"
		"psubusb %%mm7, %%mm5			\n\t"
		"psubusb %%mm1, %%mm2			\n\t"
		"por %%mm5, %%mm2			\n\t" // ABS Diff of lines
		"psubusb %%mm0, %%mm2			\n\t" // diff <= QP -> 0
		"pcmpeqb b00, %%mm2			\n\t" // diff <= QP -> FF

		"pand %%mm2, %%mm7			\n\t"
		"pandn %%mm1, %%mm2			\n\t"
		"por %%mm2, %%mm7			\n\t" // First Line to Filter

		// the following tap-building tree uses the notation:
		// digits list the weight each already-averaged input carries
		//	1	2	3	4	5	6	7	8
		//	%0	%0+%1	%0+2%1	eax	%0+4%1	eax+2%1	ebx	eax+4%1
		//	%0	%0+%1	%0+2%1	eax	%0+4%1	eax+2%1	ebx	eax+4%1
		//	6 4 2 2 1 1
		//	6 4 4 2
		//	6 8 2

		"movq (%0, %1), %%mm0			\n\t" //  1
		"movq %%mm0, %%mm1			\n\t" //  1
		PAVGB(%%mm6, %%mm0)			      //1 1	/2
		PAVGB(%%mm6, %%mm0)			      //3 1	/4

		"movq (%0, %1, 4), %%mm2		\n\t" //     1
		"movq %%mm2, %%mm5			\n\t" //     1
		PAVGB((%%eax), %%mm2)			      //    11	/2
		PAVGB((%0, %1, 2), %%mm2)		      //   211	/4
		"movq %%mm2, %%mm3			\n\t" //   211	/4
		"movq (%0), %%mm4			\n\t" // 1
		PAVGB(%%mm4, %%mm3)			      // 4 211	/8
		PAVGB(%%mm0, %%mm3)			      //642211	/16
		"movq %%mm3, (%0)			\n\t" // X
		// mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
		"movq %%mm1, %%mm0			\n\t" //  1
		PAVGB(%%mm6, %%mm0)			      //1 1	/2
		"movq %%mm4, %%mm3			\n\t" // 1
		PAVGB((%0,%1,2), %%mm3)			      // 1 1	/2
		PAVGB((%%eax,%1,2), %%mm5)		      //     11	/2
		PAVGB((%%eax), %%mm5)			      //    211 /4
		PAVGB(%%mm5, %%mm3)			      // 2 2211 /8
		PAVGB(%%mm0, %%mm3)			      //4242211 /16
		"movq %%mm3, (%0,%1)			\n\t" //  X
		// mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
		PAVGB(%%mm4, %%mm6)			      //11	/2
		"movq (%%ebx), %%mm0			\n\t" //       1
		PAVGB((%%eax, %1, 2), %%mm0)		      //      11/2
		"movq %%mm0, %%mm3			\n\t" //      11/2
		PAVGB(%%mm1, %%mm0)			      //  2   11/4
		PAVGB(%%mm6, %%mm0)			      //222   11/8
		PAVGB(%%mm2, %%mm0)			      //22242211/16
		"movq (%0, %1, 2), %%mm2		\n\t" //   1
		"movq %%mm0, (%0, %1, 2)		\n\t" //   X
		// mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
		"movq (%%eax, %1, 4), %%mm0		\n\t" //        1
		PAVGB((%%ebx), %%mm0)			      //       11/2
		PAVGB(%%mm0, %%mm6)			      //11     11/4
		PAVGB(%%mm1, %%mm4)			      // 11	/2
		PAVGB(%%mm2, %%mm1)			      //  11	/2
		PAVGB(%%mm1, %%mm6)			      //1122   11/8
		PAVGB(%%mm5, %%mm6)			      //112242211/16
		"movq (%%eax), %%mm5			\n\t" //    1
		"movq %%mm6, (%%eax)			\n\t" //    X
		// mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
		"movq (%%eax, %1, 4), %%mm6		\n\t" //        1
		PAVGB(%%mm7, %%mm6)			      //        11/2
		PAVGB(%%mm4, %%mm6)			      //11      11/4
		PAVGB(%%mm3, %%mm6)			      //11    2211/8
		PAVGB(%%mm5, %%mm2)			      //   11	/2
		"movq (%0, %1, 4), %%mm4		\n\t" //     1
		PAVGB(%%mm4, %%mm2)			      //   112	/4
		PAVGB(%%mm2, %%mm6)			      //112242211/16
		"movq %%mm6, (%0, %1, 4)		\n\t" //     X
		// mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
		PAVGB(%%mm7, %%mm1)			      //  11     2/4
		PAVGB(%%mm4, %%mm5)			      //    11	/2
		PAVGB(%%mm5, %%mm0)			      //    11 11/4
		"movq (%%eax, %1, 2), %%mm6		\n\t" //      1
		PAVGB(%%mm6, %%mm1)			      //  11  4  2/8
		PAVGB(%%mm0, %%mm1)			      //  11224222/16
		"movq %%mm1, (%%eax, %1, 2)		\n\t" //      X
		// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
		PAVGB((%%ebx), %%mm2)			      //   112 4  /8
		"movq (%%eax, %1, 4), %%mm0		\n\t" //        1
		PAVGB(%%mm0, %%mm6)			      //      1 1/2
		PAVGB(%%mm7, %%mm6)			      //      1 12/4
		PAVGB(%%mm2, %%mm6)			      //   1122424/4
		"movq %%mm6, (%%ebx)			\n\t" //       X
		// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
		PAVGB(%%mm7, %%mm5)			      //    11   2/4
		PAVGB(%%mm7, %%mm5)			      //    11   6/8

		PAVGB(%%mm3, %%mm0)			      //      112/4
		PAVGB(%%mm0, %%mm5)			      //    112246/16
		"movq %%mm5, (%%eax, %1, 4)		\n\t" //        X
		"subl %1, %0				\n\t"

		:
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	// Exact C reference of the 9-tap filter; lN = offset of line N.
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
	const int l9= stride + l8;
	int x;
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		// edge clamp: replace the outer tap by its neighbor unless the
		// step to it stays below QP (no blurring across real edges)
		const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
		const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];

		// sums[i] = pairwise sums of adjacent taps, reused across rows
		int sums[9];
		sums[0] = first + src[l1];
		sums[1] = src[l1] + src[l2];
		sums[2] = src[l2] + src[l3];
		sums[3] = src[l3] + src[l4];
		sums[4] = src[l4] + src[l5];
		sums[5] = src[l5] + src[l6];
		sums[6] = src[l6] + src[l7];
		sums[7] = src[l7] + src[l8];
		sums[8] = src[l8] + last;

		src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
		src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
		src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
		src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
		src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
		src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
		src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
		src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;

		src++;
	}
#endif
}
/**
 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar.
 * Distributes the step between line 4 and line 5 over lines 3..6 with
 * weights 1/8, 1/2, 1/2, 1/8 when the step is below ~1.25*QP.
 * values are correctly clipped (MMX2)
 * values are wraparound (C)
 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continuous gradient
	0 8 16 24
	x = 8
	x/2 = 4
	x/8 = 1
	1 12 12 23
 */
static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
// FIXME rounding
	asm volatile(
		"pxor %%mm7, %%mm7			\n\t" // 0
		"movq b80, %%mm6			\n\t" // MIN_SIGNED_BYTE (0x80 bias for signed-byte tricks)
		"leal (%0, %1), %%eax			\n\t"
		"leal (%%eax, %1, 4), %%ebx		\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		// threshold = QP + QP/4 (~= 1.25*QP)
		"movq pQPb, %%mm0			\n\t" // QP,..., QP
		"movq %%mm0, %%mm1			\n\t" // QP,..., QP
		"paddusb b02, %%mm0			\n\t"
		"psrlw $2, %%mm0			\n\t"
		"pand b3F, %%mm0			\n\t" // QP/4,..., QP/4
		"paddusb %%mm1, %%mm0			\n\t" // QP*1.25 ...
		"movq (%0, %1, 4), %%mm2		\n\t" // line 4
		"movq (%%ebx), %%mm3			\n\t" // line 5
		"movq %%mm2, %%mm4			\n\t" // line 4
		"pcmpeqb %%mm5, %%mm5			\n\t" // -1
		"pxor %%mm2, %%mm5			\n\t" // -line 4 - 1
		PAVGB(%%mm3, %%mm5)
		"paddb %%mm6, %%mm5			\n\t" // (l5-l4)/2
		"psubusb %%mm3, %%mm4			\n\t"
		"psubusb %%mm2, %%mm3			\n\t"
		"por %%mm3, %%mm4			\n\t" // |l4 - l5|
		"psubusb %%mm0, %%mm4			\n\t"
		"pcmpeqb %%mm7, %%mm4			\n\t" // mask: |l4-l5| <= 1.25*QP
		"pand %%mm4, %%mm5			\n\t" // d/2

//		"paddb %%mm6, %%mm2			\n\t" // line 4 + 0x80
		"paddb %%mm5, %%mm2			\n\t"
//		"psubb %%mm6, %%mm2			\n\t"
		"movq %%mm2, (%0,%1, 4)			\n\t"

		"movq (%%ebx), %%mm2			\n\t"
//		"paddb %%mm6, %%mm2			\n\t" // line 5 + 0x80
		"psubb %%mm5, %%mm2			\n\t"
//		"psubb %%mm6, %%mm2			\n\t"
		"movq %%mm2, (%%ebx)			\n\t"

		// derive d/8 from d/2 (bias, shift, unbias) and apply with
		// saturating signed adds to lines 3 and 6
		"paddb %%mm6, %%mm5			\n\t"
		"psrlw $2, %%mm5			\n\t"
		"pand b3F, %%mm5			\n\t"
		"psubb b20, %%mm5			\n\t" // (l5-l4)/8

		"movq (%%eax, %1, 2), %%mm2		\n\t"
		"paddb %%mm6, %%mm2			\n\t" // line 3 + 0x80
		"paddsb %%mm5, %%mm2			\n\t"
		"psubb %%mm6, %%mm2			\n\t"
		"movq %%mm2, (%%eax, %1, 2)		\n\t"

		"movq (%%ebx, %1), %%mm2		\n\t"
		"paddb %%mm6, %%mm2			\n\t" // line 6 + 0x80
		"psubsb %%mm5, %%mm2			\n\t"
		"psubb %%mm6, %%mm2			\n\t"
		"movq %%mm2, (%%ebx, %1)		\n\t"

		:
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	// C reference (NOTE: no clipping here — values may wrap, see header)
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
//	const int l7= stride + l6;
//	const int l8= stride + l7;
//	const int l9= stride + l8;
	int x;
	const int QP15= QP + (QP>>2);
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		const int v = (src[x+l5] - src[x+l4]);
		if(ABS(v) < QP15)
		{
			src[x+l3] +=v>>3;
			src[x+l4] +=v>>1;
			src[x+l5] -=v>>1;
			src[x+l6] -=v>>3;
		}
	}
#endif
}
/**
 * Experimental Filter 1
 * will not damage linear gradients
 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
 * MMX2 version does correct clipping C version doesnt
 * Correction d = max(0, |l4-l5| - (|l3-l4|+|l5-l6|)/2) is spread over lines
 * 2..7 with weights 1/8, 1/4, 3/8, 3/8, 1/4, 1/8 when d < QP.
 */
static inline void vertX1Filter(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
	asm volatile(
		"pxor %%mm7, %%mm7				\n\t" // 0
//		"movq b80, %%mm6				\n\t" // MIN_SIGNED_BYTE
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
		"movq (%0, %1, 4), %%mm1			\n\t" // line 4
		"movq %%mm1, %%mm2				\n\t" // line 4
		"psubusb %%mm0, %%mm1				\n\t"
		"psubusb %%mm2, %%mm0				\n\t"
		"por %%mm1, %%mm0				\n\t" // |l2 - l3|
		"movq (%%ebx), %%mm3				\n\t" // line 5
		"movq (%%ebx, %1), %%mm4			\n\t" // line 6
		"movq %%mm3, %%mm5				\n\t" // line 5
		"psubusb %%mm4, %%mm3				\n\t"
		"psubusb %%mm5, %%mm4				\n\t"
		"por %%mm4, %%mm3				\n\t" // |l5 - l6|
		PAVGB(%%mm3, %%mm0)				      // (|l2 - l3| + |l5 - l6|)/2
		"movq %%mm2, %%mm1				\n\t" // line 4
		"psubusb %%mm5, %%mm2				\n\t"
		"movq %%mm2, %%mm4				\n\t"
		"pcmpeqb %%mm7, %%mm2				\n\t" // (l4 - l5) <= 0 ? -1 : 0
		"psubusb %%mm1, %%mm5				\n\t"
		"por %%mm5, %%mm4				\n\t" // |l4 - l5|
		"psubusb %%mm0, %%mm4				\n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
		"movq %%mm4, %%mm3				\n\t" // d
		"psubusb pQPb, %%mm4				\n\t"
		"pcmpeqb %%mm7, %%mm4				\n\t" // d <= QP ? -1 : 0
		"psubusb b01, %%mm3				\n\t"
		"pand %%mm4, %%mm3				\n\t" // d <= QP ? d : 0

		PAVGB(%%mm7, %%mm3)				      // d/2
		"movq %%mm3, %%mm1				\n\t" // d/2
		PAVGB(%%mm7, %%mm3)				      // d/4
		PAVGB(%%mm1, %%mm3)				      // 3*d/8

		// mm2 holds the sign mask: XOR with it (one's complement)
		// turns the unsigned saturated add/sub into a signed update
		"movq (%0, %1, 4), %%mm0			\n\t" // line 4
		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
		"psubusb %%mm3, %%mm0				\n\t"
		"pxor %%mm2, %%mm0				\n\t"
		"movq %%mm0, (%0, %1, 4)			\n\t" // line 4

		"movq (%%ebx), %%mm0				\n\t" // line 5
		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
		"paddusb %%mm3, %%mm0				\n\t"
		"pxor %%mm2, %%mm0				\n\t"
		"movq %%mm0, (%%ebx)				\n\t" // line 5

		PAVGB(%%mm7, %%mm1)				      // d/4

		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
		"psubusb %%mm1, %%mm0				\n\t"
		"pxor %%mm2, %%mm0				\n\t"
		"movq %%mm0, (%%eax, %1, 2)			\n\t" // line 3

		"movq (%%ebx, %1), %%mm0			\n\t" // line 6
		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
		"paddusb %%mm1, %%mm0				\n\t"
		"pxor %%mm2, %%mm0				\n\t"
		"movq %%mm0, (%%ebx, %1)			\n\t" // line 6

		PAVGB(%%mm7, %%mm1)				      // d/8

		"movq (%%eax, %1), %%mm0			\n\t" // line 2
		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
		"psubusb %%mm1, %%mm0				\n\t"
		"pxor %%mm2, %%mm0				\n\t"
		"movq %%mm0, (%%eax, %1)			\n\t" // line 2

		"movq (%%ebx, %1, 2), %%mm0			\n\t" // line 7
		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
		"paddusb %%mm1, %%mm0				\n\t"
		"pxor %%mm2, %%mm0				\n\t"
		"movq %%mm0, (%%ebx, %1, 2)			\n\t" // line 7

		:
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	// C reference (no clipping — see header comment)
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
//	const int l8= stride + l7;
//	const int l9= stride + l8;
	int x;

	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		int a= src[l3] - src[l4];
		int b= src[l4] - src[l5];
		int c= src[l5] - src[l6];

		int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
		d= MAX(d, 0);

		if(d < QP)
		{
			int v = d * SIGN(-b);

			src[l2] +=v>>3;
			src[l3] +=v>>2;
			src[l4] +=(3*v)>>3;
			src[l5] -=(3*v)>>3;
			src[l6] -=v>>2;
			src[l7] -=v>>3;
		}
		src++;
	}
/*
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
	const int l9= stride + l8;
	for(int x=0; x<BLOCK_SIZE; x++)
	{
		int v2= src[l2];
		int v3= src[l3];
		int v4= src[l4];
		int v5= src[l5];
		int v6= src[l6];
		int v7= src[l7];

		if(ABS(v4-v5)<QP &&  ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
		{
			src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6         )/16;
			src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7  )/16;
			src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
			src[l6] = (       1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
		}
		src++;
	}
*/
#endif
}
  777. /**
  778. * Experimental Filter 1 (Horizontal)
  779. * will not damage linear gradients
  780. * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
  781. * can only smooth blocks at the expected locations (it cant smooth them if they did move)
  782. * MMX2 version does correct clipping C version doesnt
  783. * not identical with the vertical one
  784. */
  785. static inline void horizX1Filter(uint8_t *src, int stride, int QP)
  786. {
  787. int y;
  788. //FIXME (has little in common with the mmx2 version)
  789. for(y=0; y<BLOCK_SIZE; y++)
  790. {
  791. int a= src[1] - src[2];
  792. int b= src[3] - src[4];
  793. int c= src[5] - src[6];
  794. int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
  795. if(d < QP)
  796. {
  797. int v = d * SIGN(-b);
  798. src[1] +=v/8;
  799. src[2] +=v/4;
  800. src[3] +=3*v/8;
  801. src[4] -=3*v/8;
  802. src[5] -=v/4;
  803. src[6] -=v/8;
  804. }
  805. src+=stride;
  806. }
  807. }
/**
 * Default vertical deblocking filter (C reference at the bottom).
 * Works across the lower edge of an 8x8 block: only the two lines next to
 * that edge — src[stride*7] and src[stride*8] relative to the src passed
 * in — are written; neighbouring lines are read to estimate the left,
 * middle and right "energy".  When |middleEnergy| < 8*QP a correction
 * d = (5*d + 32) >> 6 is applied, clamped so it never overshoots
 * q = (l4 - l5)/2.  The MMX2/3DNow path below approximates this with
 * PAVGB arithmetic biased around 128 (see the per-line comments).
 */
static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/*
	uint8_t tmp[16];
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= (int)tmp - (int)src - stride*3;
	const int l5= (int)tmp - (int)src - stride*3 + 8;
	const int l6= stride*3 + l3;
	const int l7= stride + l6;
	const int l8= stride + l7;

	memcpy(tmp, src+stride*7, 8);
	memcpy(tmp+8, src+stride*8, 8);
*/
	src+= stride*4;
	asm volatile(

#if 0 //slightly more accurate and slightly slower
		"pxor %%mm7, %%mm7				\n\t" // 0
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7
//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ebx+%1	ebx+2%1
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1


		"movq (%0, %1, 2), %%mm0			\n\t" // l2
		"movq (%0), %%mm1				\n\t" // l0
		"movq %%mm0, %%mm2				\n\t" // l2
		PAVGB(%%mm7, %%mm0)				      // ~l2/2
		PAVGB(%%mm1, %%mm0)				      // ~(l2 + 2l0)/4
		PAVGB(%%mm2, %%mm0)				      // ~(5l2 + 2l0)/8

		"movq (%%eax), %%mm1				\n\t" // l1
		"movq (%%eax, %1, 2), %%mm3			\n\t" // l3
		"movq %%mm1, %%mm4				\n\t" // l1
		PAVGB(%%mm7, %%mm1)				      // ~l1/2
		PAVGB(%%mm3, %%mm1)				      // ~(l1 + 2l3)/4
		PAVGB(%%mm4, %%mm1)				      // ~(5l1 + 2l3)/8

		"movq %%mm0, %%mm4				\n\t" // ~(5l2 + 2l0)/8
		"psubusb %%mm1, %%mm0				\n\t"
		"psubusb %%mm4, %%mm1				\n\t"
		"por %%mm0, %%mm1				\n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0

		"movq (%0, %1, 4), %%mm0			\n\t" // l4
		"movq %%mm0, %%mm4				\n\t" // l4
		PAVGB(%%mm7, %%mm0)				      // ~l4/2
		PAVGB(%%mm2, %%mm0)				      // ~(l4 + 2l2)/4
		PAVGB(%%mm4, %%mm0)				      // ~(5l4 + 2l2)/8

		"movq (%%ebx), %%mm2				\n\t" // l5
		"movq %%mm3, %%mm5				\n\t" // l3
		PAVGB(%%mm7, %%mm3)				      // ~l3/2
		PAVGB(%%mm2, %%mm3)				      // ~(l3 + 2l5)/4
		PAVGB(%%mm5, %%mm3)				      // ~(5l3 + 2l5)/8

		"movq %%mm0, %%mm6				\n\t" // ~(5l4 + 2l2)/8
		"psubusb %%mm3, %%mm0				\n\t"
		"psubusb %%mm6, %%mm3				\n\t"
		"por %%mm0, %%mm3				\n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
		"pcmpeqb %%mm7, %%mm0				\n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0

		"movq (%%ebx, %1), %%mm6			\n\t" // l6
		"movq %%mm6, %%mm5				\n\t" // l6
		PAVGB(%%mm7, %%mm6)				      // ~l6/2
		PAVGB(%%mm4, %%mm6)				      // ~(l6 + 2l4)/4
		PAVGB(%%mm5, %%mm6)				      // ~(5l6 + 2l4)/8

		"movq (%%ebx, %1, 2), %%mm5			\n\t" // l7
		"movq %%mm2, %%mm4				\n\t" // l5
		PAVGB(%%mm7, %%mm2)				      // ~l5/2
		PAVGB(%%mm5, %%mm2)				      // ~(l5 + 2l7)/4
		PAVGB(%%mm4, %%mm2)				      // ~(5l5 + 2l7)/8

		"movq %%mm6, %%mm4				\n\t" // ~(5l6 + 2l4)/8
		"psubusb %%mm2, %%mm6				\n\t"
		"psubusb %%mm4, %%mm2				\n\t"
		"por %%mm6, %%mm2				\n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0

		PMINUB(%%mm2, %%mm1, %%mm4)			      // MIN(|lenergy|,|renergy|)/8
		"movq pQPb, %%mm4				\n\t" // QP //FIXME QP+1 ?
		"paddusb b01, %%mm4				\n\t"
		"pcmpgtb %%mm3, %%mm4				\n\t" // |menergy|/8 < QP
		"psubusb %%mm1, %%mm3				\n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
		"pand %%mm4, %%mm3				\n\t"

		"movq %%mm3, %%mm1				\n\t"
//		"psubusb b01, %%mm3				\n\t"
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm7, %%mm3)
		"paddusb %%mm1, %%mm3				\n\t"
//		"paddusb b01, %%mm3				\n\t"

		"movq (%%eax, %1, 2), %%mm6			\n\t" //l3
		"movq (%0, %1, 4), %%mm5			\n\t" //l4
		"movq (%0, %1, 4), %%mm4			\n\t" //l4
		"psubusb %%mm6, %%mm5				\n\t"
		"psubusb %%mm4, %%mm6				\n\t"
		"por %%mm6, %%mm5				\n\t" // |l3-l4|
		"pcmpeqb %%mm7, %%mm6				\n\t" // SIGN(l3-l4)
		"pxor %%mm6, %%mm0				\n\t"
		"pand %%mm0, %%mm3				\n\t"
		PMINUB(%%mm5, %%mm3, %%mm0)

		"psubusb b01, %%mm3				\n\t"
		PAVGB(%%mm7, %%mm3)

		"movq (%%eax, %1, 2), %%mm0			\n\t"
		"movq (%0, %1, 4), %%mm2			\n\t"
		"pxor %%mm6, %%mm0				\n\t"
		"pxor %%mm6, %%mm2				\n\t"
		"psubb %%mm3, %%mm0				\n\t"
		"paddb %%mm3, %%mm2				\n\t"
		"pxor %%mm6, %%mm0				\n\t"
		"pxor %%mm6, %%mm2				\n\t"
		"movq %%mm0, (%%eax, %1, 2)			\n\t"
		"movq %%mm2, (%0, %1, 4)			\n\t"
#endif

		// used path: everything is computed biased by +128 so unsigned
		// byte ops can carry signed intermediate values
		"leal (%0, %1), %%eax				\n\t"
		"pcmpeqb %%mm6, %%mm6				\n\t" // -1
//	0	1	2	3	4	5	6	7
//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ebx+%1	ebx+2%1
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1

		"movq (%%eax, %1, 2), %%mm1			\n\t" // l3
		"movq (%0, %1, 4), %%mm0			\n\t" // l4
		"pxor %%mm6, %%mm1				\n\t" // -l3-1
		PAVGB(%%mm1, %%mm0)				      // -q+128 = (l4-l3+256)/2
// mm1=-l3-1, mm0=128-q

		"movq (%%eax, %1, 4), %%mm2			\n\t" // l5
		"movq (%%eax, %1), %%mm3			\n\t" // l2
		"pxor %%mm6, %%mm2				\n\t" // -l5-1
		"movq %%mm2, %%mm5				\n\t" // -l5-1
		"movq b80, %%mm4				\n\t" // 128
		"leal (%%eax, %1, 4), %%ebx			\n\t"
		PAVGB(%%mm3, %%mm2)				      // (l2-l5+256)/2
		PAVGB(%%mm0, %%mm4)				      // ~(l4-l3)/4 + 128
		PAVGB(%%mm2, %%mm4)				      // ~(l2-l5)/4 +(l4-l3)/8 + 128
		PAVGB(%%mm0, %%mm4)				      // ~(l2-l5)/8 +5(l4-l3)/16 + 128
// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1

		"movq (%%eax), %%mm2				\n\t" // l1
		"pxor %%mm6, %%mm2				\n\t" // -l1-1
		PAVGB(%%mm3, %%mm2)				      // (l2-l1+256)/2
		PAVGB((%0), %%mm1)				      // (l0-l3+256)/2
		"movq b80, %%mm3				\n\t" // 128
		PAVGB(%%mm2, %%mm3)				      // ~(l2-l1)/4 + 128
		PAVGB(%%mm1, %%mm3)				      // ~(l0-l3)/4 +(l2-l1)/8 + 128
		PAVGB(%%mm2, %%mm3)				      // ~(l0-l3)/8 +5(l2-l1)/16 + 128
// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1

		PAVGB((%%ebx, %1), %%mm5)			      // (l6-l5+256)/2
		"movq (%%ebx, %1, 2), %%mm1			\n\t" // l7
		"pxor %%mm6, %%mm1				\n\t" // -l7-1
		PAVGB((%0, %1, 4), %%mm1)			      // (l4-l7+256)/2
		"movq b80, %%mm2				\n\t" // 128
		PAVGB(%%mm5, %%mm2)				      // ~(l6-l5)/4 + 128
		PAVGB(%%mm1, %%mm2)				      // ~(l4-l7)/4 +(l6-l5)/8 + 128
		PAVGB(%%mm5, %%mm2)				      // ~(l4-l7)/8 +5(l6-l5)/16 + 128
// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128

		"movq b00, %%mm1				\n\t" // 0
		"movq b00, %%mm5				\n\t" // 0
		"psubb %%mm2, %%mm1				\n\t" // 128 - renergy/16
		"psubb %%mm3, %%mm5				\n\t" // 128 - lenergy/16
		PMAXUB(%%mm1, %%mm2)				      // 128 + |renergy/16|
		PMAXUB(%%mm5, %%mm3)				      // 128 + |lenergy/16|
		PMINUB(%%mm2, %%mm3, %%mm1)			      // 128 + MIN(|lenergy|,|renergy|)/16
// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128

		"movq b00, %%mm7				\n\t" // 0
		"movq pQPb, %%mm2				\n\t" // QP
		PAVGB(%%mm6, %%mm2)				      // 128 + QP/2
		"psubb %%mm6, %%mm2				\n\t"

		"movq %%mm4, %%mm1				\n\t"
		"pcmpgtb %%mm7, %%mm1				\n\t" // SIGN(menergy)
		"pxor %%mm1, %%mm4				\n\t"
		"psubb %%mm1, %%mm4				\n\t" // 128 + |menergy|/16
		"pcmpgtb %%mm4, %%mm2				\n\t" // |menergy|/16 < QP/2
		"psubusb %%mm3, %%mm4				\n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16

		"movq %%mm4, %%mm3				\n\t" // d
		"psubusb b01, %%mm4				\n\t"
		PAVGB(%%mm7, %%mm4)				      // d/32
		PAVGB(%%mm7, %%mm4)				      // (d + 32)/64
		"paddb %%mm3, %%mm4				\n\t" // 5d/64
		"pand %%mm2, %%mm4				\n\t"

		"movq b80, %%mm5				\n\t" // 128
		"psubb %%mm0, %%mm5				\n\t" // q
		"paddsb %%mm6, %%mm5				\n\t" // fix bad rounding
		"pcmpgtb %%mm5, %%mm7				\n\t" // SIGN(q)
		"pxor %%mm7, %%mm5				\n\t"

		PMINUB(%%mm5, %%mm4, %%mm3)			      // MIN(|q|, 5d/64)
		"pxor %%mm1, %%mm7				\n\t" // SIGN(d*q)

		"pand %%mm7, %%mm4				\n\t"
		"movq (%%eax, %1, 2), %%mm0			\n\t"
		"movq (%0, %1, 4), %%mm2			\n\t"
		"pxor %%mm1, %%mm0				\n\t"
		"pxor %%mm1, %%mm2				\n\t"
		"paddb %%mm4, %%mm0				\n\t"
		"psubb %%mm4, %%mm2				\n\t"
		"pxor %%mm1, %%mm0				\n\t"
		"pxor %%mm1, %%mm2				\n\t"
		"movq %%mm0, (%%eax, %1, 2)			\n\t"
		"movq %%mm2, (%0, %1, 4)			\n\t"

		:
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);

/*
	{
	int x;
	src-= stride;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
		if(ABS(middleEnergy)< 8*QP)
		{
			const int q=(src[l4] - src[l5])/2;
			const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
			const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);

			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
			d= MAX(d, 0);

			d= (5*d + 32) >> 6;
			d*= SIGN(-middleEnergy);

			if(q>0)
			{
				d= d<0 ? 0 : d;
				d= d>q ? q : d;
			}
			else
			{
				d= d>0 ? 0 : d;
				d= d<q ? q : d;
			}

			src[l4]-= d;
			src[l5]+= d;
		}
		src++;
	}
	src-=8;
	for(x=0; x<8; x++)
	{
		int y;
		for(y=4; y<6; y++)
		{
			int d= src[x+y*stride] - tmp[x+(y-4)*8];
			int ad= ABS(d);
			static int max=0;
			static int sum=0;
			static int num=0;
			static int bias=0;

			if(max<ad) max=ad;
			sum+= ad>3 ? 1 : 0;
			if(ad>3)
			{
				src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
			}
			if(y==4) bias+=d;
			num++;
			if(num%1000000 == 0)
			{
				printf(" %d %d %d %d\n", num, sum, max, bias);
			}
		}
	}
}
*/
#elif defined (HAVE_MMX)
	// plain MMX path: exact 16-bit arithmetic on unpacked low/high halves,
	// matching the C reference below; temp0..temp3 are module-level scratch
	src+= stride*4;
	asm volatile(
		"pxor %%mm7, %%mm7				\n\t"
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7
//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ebx+%1	ebx+2%1
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1

		"movq (%0), %%mm0				\n\t"
		"movq %%mm0, %%mm1				\n\t"
		"punpcklbw %%mm7, %%mm0				\n\t" // low part of line 0
		"punpckhbw %%mm7, %%mm1				\n\t" // high part of line 0

		"movq (%%eax), %%mm2				\n\t"
		"movq %%mm2, %%mm3				\n\t"
		"punpcklbw %%mm7, %%mm2				\n\t" // low part of line 1
		"punpckhbw %%mm7, %%mm3				\n\t" // high part of line 1

		"movq (%%eax, %1), %%mm4			\n\t"
		"movq %%mm4, %%mm5				\n\t"
		"punpcklbw %%mm7, %%mm4				\n\t" // low part of line 2
		"punpckhbw %%mm7, %%mm5				\n\t" // high part of line 2

		"paddw %%mm0, %%mm0				\n\t" // 2L0
		"paddw %%mm1, %%mm1				\n\t" // 2H0
		"psubw %%mm4, %%mm2				\n\t" // L1 - L2
		"psubw %%mm5, %%mm3				\n\t" // H1 - H2
		"psubw %%mm2, %%mm0				\n\t" // 2L0 - L1 + L2
		"psubw %%mm3, %%mm1				\n\t" // 2H0 - H1 + H2

		"psllw $2, %%mm2				\n\t" // 4L1 - 4L2
		"psllw $2, %%mm3				\n\t" // 4H1 - 4H2
		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2
		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2

		"movq (%%eax, %1, 2), %%mm2			\n\t"
		"movq %%mm2, %%mm3				\n\t"
		"punpcklbw %%mm7, %%mm2				\n\t" // L3
		"punpckhbw %%mm7, %%mm3				\n\t" // H3

		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - L3
		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - H3
		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
		"movq %%mm0, temp0				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
		"movq %%mm1, temp1				\n\t" // 2H0 - 5H1 + 5H2 - 2H3

		"movq (%0, %1, 4), %%mm0			\n\t"
		"movq %%mm0, %%mm1				\n\t"
		"punpcklbw %%mm7, %%mm0				\n\t" // L4
		"punpckhbw %%mm7, %%mm1				\n\t" // H4

		"psubw %%mm0, %%mm2				\n\t" // L3 - L4
		"psubw %%mm1, %%mm3				\n\t" // H3 - H4
		"movq %%mm2, temp2				\n\t" // L3 - L4
		"movq %%mm3, temp3				\n\t" // H3 - H4
		"paddw %%mm4, %%mm4				\n\t" // 2L2
		"paddw %%mm5, %%mm5				\n\t" // 2H2
		"psubw %%mm2, %%mm4				\n\t" // 2L2 - L3 + L4
		"psubw %%mm3, %%mm5				\n\t" // 2H2 - H3 + H4

		"psllw $2, %%mm2				\n\t" // 4L3 - 4L4
		"psllw $2, %%mm3				\n\t" // 4H3 - 4H4
		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4
		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4
//50 opcodes so far
		"movq (%%ebx), %%mm2				\n\t"
		"movq %%mm2, %%mm3				\n\t"
		"punpcklbw %%mm7, %%mm2				\n\t" // L5
		"punpckhbw %%mm7, %%mm3				\n\t" // H5
		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - L5
		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - H5
		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - 2L5
		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - 2H5

		"movq (%%ebx, %1), %%mm6			\n\t"
		"punpcklbw %%mm7, %%mm6				\n\t" // L6
		"psubw %%mm6, %%mm2				\n\t" // L5 - L6
		"movq (%%ebx, %1), %%mm6			\n\t"
		"punpckhbw %%mm7, %%mm6				\n\t" // H6
		"psubw %%mm6, %%mm3				\n\t" // H5 - H6

		"paddw %%mm0, %%mm0				\n\t" // 2L4
		"paddw %%mm1, %%mm1				\n\t" // 2H4
		"psubw %%mm2, %%mm0				\n\t" // 2L4 - L5 + L6
		"psubw %%mm3, %%mm1				\n\t" // 2H4 - H5 + H6

		"psllw $2, %%mm2				\n\t" // 4L5 - 4L6
		"psllw $2, %%mm3				\n\t" // 4H5 - 4H6
		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6
		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6

		"movq (%%ebx, %1, 2), %%mm2			\n\t"
		"movq %%mm2, %%mm3				\n\t"
		"punpcklbw %%mm7, %%mm2				\n\t" // L7
		"punpckhbw %%mm7, %%mm3				\n\t" // H7

		"paddw %%mm2, %%mm2				\n\t" // 2L7
		"paddw %%mm3, %%mm3				\n\t" // 2H7
		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6 - 2L7
		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6 - 2H7

		"movq temp0, %%mm2				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
		"movq temp1, %%mm3				\n\t" // 2H0 - 5H1 + 5H2 - 2H3

#ifdef HAVE_MMX2
		"movq %%mm7, %%mm6				\n\t" // 0
		"psubw %%mm0, %%mm6				\n\t"
		"pmaxsw %%mm6, %%mm0				\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
		"movq %%mm7, %%mm6				\n\t" // 0
		"psubw %%mm1, %%mm6				\n\t"
		"pmaxsw %%mm6, %%mm1				\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
		"movq %%mm7, %%mm6				\n\t" // 0
		"psubw %%mm2, %%mm6				\n\t"
		"pmaxsw %%mm6, %%mm2				\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
		"movq %%mm7, %%mm6				\n\t" // 0
		"psubw %%mm3, %%mm6				\n\t"
		"pmaxsw %%mm6, %%mm3				\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
#else
		"movq %%mm7, %%mm6				\n\t" // 0
		"pcmpgtw %%mm0, %%mm6				\n\t"
		"pxor %%mm6, %%mm0				\n\t"
		"psubw %%mm6, %%mm0				\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
		"movq %%mm7, %%mm6				\n\t" // 0
		"pcmpgtw %%mm1, %%mm6				\n\t"
		"pxor %%mm6, %%mm1				\n\t"
		"psubw %%mm6, %%mm1				\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
		"movq %%mm7, %%mm6				\n\t" // 0
		"pcmpgtw %%mm2, %%mm6				\n\t"
		"pxor %%mm6, %%mm2				\n\t"
		"psubw %%mm6, %%mm2				\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
		"movq %%mm7, %%mm6				\n\t" // 0
		"pcmpgtw %%mm3, %%mm6				\n\t"
		"pxor %%mm6, %%mm3				\n\t"
		"psubw %%mm6, %%mm3				\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
#endif

#ifdef HAVE_MMX2
		"pminsw %%mm2, %%mm0				\n\t"
		"pminsw %%mm3, %%mm1				\n\t"
#else
		"movq %%mm0, %%mm6				\n\t"
		"psubusw %%mm2, %%mm6				\n\t"
		"psubw %%mm6, %%mm0				\n\t"
		"movq %%mm1, %%mm6				\n\t"
		"psubusw %%mm3, %%mm6				\n\t"
		"psubw %%mm6, %%mm1				\n\t"
#endif

		"movq %%mm7, %%mm6				\n\t" // 0
		"pcmpgtw %%mm4, %%mm6				\n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
		"pxor %%mm6, %%mm4				\n\t"
		"psubw %%mm6, %%mm4				\n\t" // |2L2 - 5L3 + 5L4 - 2L5|
		"pcmpgtw %%mm5, %%mm7				\n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
		"pxor %%mm7, %%mm5				\n\t"
		"psubw %%mm7, %%mm5				\n\t" // |2H2 - 5H3 + 5H4 - 2H5|
// 100 opcodes
		"movd %2, %%mm2					\n\t" // QP
		"punpcklwd %%mm2, %%mm2				\n\t"
		"punpcklwd %%mm2, %%mm2				\n\t"
		"psllw $3, %%mm2				\n\t" // 8QP
		"movq %%mm2, %%mm3				\n\t" // 8QP
		"pcmpgtw %%mm4, %%mm2				\n\t"
		"pcmpgtw %%mm5, %%mm3				\n\t"
		"pand %%mm2, %%mm4				\n\t"
		"pand %%mm3, %%mm5				\n\t"

		"psubusw %%mm0, %%mm4				\n\t" // hd
		"psubusw %%mm1, %%mm5				\n\t" // ld

		"movq w05, %%mm2				\n\t" // 5
		"pmullw %%mm2, %%mm4				\n\t"
		"pmullw %%mm2, %%mm5				\n\t"
		"movq w20, %%mm2				\n\t" // 32
		"paddw %%mm2, %%mm4				\n\t"
		"paddw %%mm2, %%mm5				\n\t"
		"psrlw $6, %%mm4				\n\t"
		"psrlw $6, %%mm5				\n\t"

/*
		"movq w06, %%mm2				\n\t" // 6
		"paddw %%mm2, %%mm4				\n\t"
		"paddw %%mm2, %%mm5				\n\t"
		"movq w1400, %%mm2				\n\t" // 1400h = 5120 = 5/64*2^16
//FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
		"pmulhw %%mm2, %%mm4				\n\t" // hd/13
		"pmulhw %%mm2, %%mm5				\n\t" // ld/13
*/

		"movq temp2, %%mm0				\n\t" // L3 - L4
		"movq temp3, %%mm1				\n\t" // H3 - H4

		"pxor %%mm2, %%mm2				\n\t"
		"pxor %%mm3, %%mm3				\n\t"

		"pcmpgtw %%mm0, %%mm2				\n\t" // sign (L3-L4)
		"pcmpgtw %%mm1, %%mm3				\n\t" // sign (H3-H4)
		"pxor %%mm2, %%mm0				\n\t"
		"pxor %%mm3, %%mm1				\n\t"
		"psubw %%mm2, %%mm0				\n\t" // |L3-L4|
		"psubw %%mm3, %%mm1				\n\t" // |H3-H4|
		"psrlw $1, %%mm0				\n\t" // |L3 - L4|/2
		"psrlw $1, %%mm1				\n\t" // |H3 - H4|/2

		"pxor %%mm6, %%mm2				\n\t"
		"pxor %%mm7, %%mm3				\n\t"
		"pand %%mm2, %%mm4				\n\t"
		"pand %%mm3, %%mm5				\n\t"

#ifdef HAVE_MMX2
		"pminsw %%mm0, %%mm4				\n\t"
		"pminsw %%mm1, %%mm5				\n\t"
#else
		"movq %%mm4, %%mm2				\n\t"
		"psubusw %%mm0, %%mm2				\n\t"
		"psubw %%mm2, %%mm4				\n\t"
		"movq %%mm5, %%mm2				\n\t"
		"psubusw %%mm1, %%mm2				\n\t"
		"psubw %%mm2, %%mm5				\n\t"
#endif
		"pxor %%mm6, %%mm4				\n\t"
		"pxor %%mm7, %%mm5				\n\t"
		"psubw %%mm6, %%mm4				\n\t"
		"psubw %%mm7, %%mm5				\n\t"
		"packsswb %%mm5, %%mm4				\n\t"
		"movq (%%eax, %1, 2), %%mm0			\n\t"
		"paddb   %%mm4, %%mm0				\n\t"
		"movq %%mm0, (%%eax, %1, 2)			\n\t"
		"movq (%0, %1, 4), %%mm0			\n\t"
		"psubb %%mm4, %%mm0				\n\t"
		"movq %%mm0, (%0, %1, 4)			\n\t"

		:
		: "r" (src), "r" (stride), "r" (QP)
		: "%eax", "%ebx"
	);
#else
	// C reference implementation (see the header comment for the algorithm)
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
//	const int l9= stride + l8;
	int x;
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
		if(ABS(middleEnergy) < 8*QP)
		{
			const int q=(src[l4] - src[l5])/2;
			const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
			const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);

			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
			d= MAX(d, 0);

			d= (5*d + 32) >> 6;
			d*= SIGN(-middleEnergy);

			if(q>0)
			{
				d= d<0 ? 0 : d;
				d= d>q ? q : d;
			}
			else
			{
				d= d>0 ? 0 : d;
				d= d<q ? q : d;
			}

			src[l4]-= d;
			src[l5]+= d;
		}
		src++;
	}
#endif
}
  1312. /**
  1313. * Check if the given 8x8 Block is mostly "flat"
  1314. */
  1315. static inline int isHorizDC(uint8_t src[], int stride)
  1316. {
  1317. int numEq= 0;
  1318. int y;
  1319. for(y=0; y<BLOCK_SIZE; y++)
  1320. {
  1321. if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
  1322. if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
  1323. if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
  1324. if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
  1325. if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
  1326. if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
  1327. if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
  1328. src+= stride;
  1329. }
  1330. return numEq > hFlatnessThreshold;
  1331. }
  1332. static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
  1333. {
  1334. if(abs(src[0] - src[7]) > 2*QP) return 0;
  1335. return 1;
  1336. }
  1337. static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
  1338. {
  1339. int y;
  1340. for(y=0; y<BLOCK_SIZE; y++)
  1341. {
  1342. const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]);
  1343. if(ABS(middleEnergy) < 8*QP)
  1344. {
  1345. const int q=(dst[3] - dst[4])/2;
  1346. const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
  1347. const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
  1348. int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
  1349. d= MAX(d, 0);
  1350. d= (5*d + 32) >> 6;
  1351. d*= SIGN(-middleEnergy);
  1352. if(q>0)
  1353. {
  1354. d= d<0 ? 0 : d;
  1355. d= d>q ? q : d;
  1356. }
  1357. else
  1358. {
  1359. d= d>0 ? 0 : d;
  1360. d= d<q ? q : d;
  1361. }
  1362. dst[3]-= d;
  1363. dst[4]+= d;
  1364. }
  1365. dst+= stride;
  1366. }
  1367. }
  1368. /**
  1369. * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
  1370. * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
  1371. */
  1372. static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
  1373. {
  1374. int y;
  1375. for(y=0; y<BLOCK_SIZE; y++)
  1376. {
  1377. const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
  1378. const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
  1379. int sums[9];
  1380. sums[0] = first + dst[0];
  1381. sums[1] = dst[0] + dst[1];
  1382. sums[2] = dst[1] + dst[2];
  1383. sums[3] = dst[2] + dst[3];
  1384. sums[4] = dst[3] + dst[4];
  1385. sums[5] = dst[4] + dst[5];
  1386. sums[6] = dst[5] + dst[6];
  1387. sums[7] = dst[6] + dst[7];
  1388. sums[8] = dst[7] + last;
  1389. dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
  1390. dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
  1391. dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
  1392. dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
  1393. dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
  1394. dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
  1395. dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
  1396. dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
  1397. dst+= stride;
  1398. }
  1399. }
/**
 * De-ringing filter on a 10x10 region (src points at its top-left).
 * Finds the min/max of the block, takes their midpoint as a threshold, and
 * smooths with a (1,2,1)x(1,2,1)/16 kernel only those pixels whose 3x3
 * neighbourhood lies entirely on one side of the threshold; the change per
 * pixel is limited to +-2*QP (pQPb2 in the SIMD path, explicit in C).
 */
static inline void dering(uint8_t src[], int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	// NOTE(review): the "r"(QP) operand (%2) is never referenced in the asm
	// below — QP arrives via the pQPb global; kept as-is.
	asm volatile(
		"movq pQPb, %%mm0				\n\t"
		"paddusb %%mm0, %%mm0				\n\t"
		"movq %%mm0, pQPb2				\n\t" // pQPb2 = 2*QP per byte

		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"pcmpeqb %%mm6, %%mm6				\n\t" // mm6 = running min (starts at 255)
		"pxor %%mm7, %%mm7				\n\t" // mm7 = running max (starts at 0)

#ifdef HAVE_MMX2
#define FIND_MIN_MAX(addr)\
		"movq " #addr ", %%mm0				\n\t"\
		"pminub %%mm0, %%mm6				\n\t"\
		"pmaxub %%mm0, %%mm7				\n\t"
#else
#define FIND_MIN_MAX(addr)\
		"movq " #addr ", %%mm0				\n\t"\
		"movq %%mm6, %%mm1				\n\t"\
		"psubusb %%mm0, %%mm7				\n\t"\
		"paddb %%mm0, %%mm7				\n\t"\
		"psubusb %%mm0, %%mm1				\n\t"\
		"psubb %%mm1, %%mm6				\n\t"
#endif

FIND_MIN_MAX((%%eax))
FIND_MIN_MAX((%%eax, %1))
FIND_MIN_MAX((%%eax, %1, 2))
FIND_MIN_MAX((%0, %1, 4))
FIND_MIN_MAX((%%ebx))
FIND_MIN_MAX((%%ebx, %1))
FIND_MIN_MAX((%%ebx, %1, 2))
FIND_MIN_MAX((%0, %1, 8))

		// horizontal reduction of the 8 per-column minima to one byte
		"movq %%mm6, %%mm4				\n\t"
		"psrlq $8, %%mm6				\n\t"
#ifdef HAVE_MMX2
		"pminub %%mm4, %%mm6				\n\t" // min of pixels
		"pshufw $0xF9, %%mm6, %%mm4			\n\t"
		"pminub %%mm4, %%mm6				\n\t" // min of pixels
		"pshufw $0xFE, %%mm6, %%mm4			\n\t"
		"pminub %%mm4, %%mm6				\n\t"
#else
		"movq %%mm6, %%mm1				\n\t"
		"psubusb %%mm4, %%mm1				\n\t"
		"psubb %%mm1, %%mm6				\n\t"
		"movq %%mm6, %%mm4				\n\t"
		"psrlq $16, %%mm6				\n\t"
		"movq %%mm6, %%mm1				\n\t"
		"psubusb %%mm4, %%mm1				\n\t"
		"psubb %%mm1, %%mm6				\n\t"
		"movq %%mm6, %%mm4				\n\t"
		"psrlq $32, %%mm6				\n\t"
		"movq %%mm6, %%mm1				\n\t"
		"psubusb %%mm4, %%mm1				\n\t"
		"psubb %%mm1, %%mm6				\n\t"
#endif

		// same reduction for the maxima
		"movq %%mm7, %%mm4				\n\t"
		"psrlq $8, %%mm7				\n\t"
#ifdef HAVE_MMX2
		"pmaxub %%mm4, %%mm7				\n\t" // max of pixels
		"pshufw $0xF9, %%mm7, %%mm4			\n\t"
		"pmaxub %%mm4, %%mm7				\n\t"
		"pshufw $0xFE, %%mm7, %%mm4			\n\t"
		"pmaxub %%mm4, %%mm7				\n\t"
#else
		"psubusb %%mm4, %%mm7				\n\t"
		"paddb %%mm4, %%mm7				\n\t"
		"movq %%mm7, %%mm4				\n\t"
		"psrlq $16, %%mm7				\n\t"
		"psubusb %%mm4, %%mm7				\n\t"
		"paddb %%mm4, %%mm7				\n\t"
		"movq %%mm7, %%mm4				\n\t"
		"psrlq $32, %%mm7				\n\t"
		"psubusb %%mm4, %%mm7				\n\t"
		"paddb %%mm4, %%mm7				\n\t"
#endif
		PAVGB(%%mm6, %%mm7)				      // a=(max + min)/2
		"punpcklbw %%mm7, %%mm7				\n\t"
		"punpcklbw %%mm7, %%mm7				\n\t"
		"punpcklbw %%mm7, %%mm7				\n\t"
		"movq %%mm7, temp0				\n\t" // broadcast threshold a

		// prime the pipeline with the first two lines (L1x) including
		// their left/right neighbours fetched from -4(%0) / 8(%0)
		"movq (%0), %%mm0				\n\t" // L10
		"movq %%mm0, %%mm1				\n\t" // L10
		"movq %%mm0, %%mm2				\n\t" // L10
		"psllq $8, %%mm1				\n\t"
		"psrlq $8, %%mm2				\n\t"
		"movd -4(%0), %%mm3				\n\t"
		"movd 8(%0), %%mm4				\n\t"
		"psrlq $24, %%mm3				\n\t"
		"psllq $56, %%mm4				\n\t"
		"por %%mm3, %%mm1				\n\t" // L00
		"por %%mm4, %%mm2				\n\t" // L20
		"movq %%mm1, %%mm3				\n\t" // L00
		PAVGB(%%mm2, %%mm1)				      // (L20 + L00)/2
		PAVGB(%%mm0, %%mm1)				      // (L20 + L00 + 2L10)/4
		"psubusb %%mm7, %%mm0				\n\t"
		"psubusb %%mm7, %%mm2				\n\t"
		"psubusb %%mm7, %%mm3				\n\t"
		"pcmpeqb b00, %%mm0				\n\t" // L10 > a ? 0 : -1
		"pcmpeqb b00, %%mm2				\n\t" // L20 > a ? 0 : -1
		"pcmpeqb b00, %%mm3				\n\t" // L00 > a ? 0 : -1
		"paddb %%mm2, %%mm0				\n\t"
		"paddb %%mm3, %%mm0				\n\t"

		"movq (%%eax), %%mm2				\n\t" // L11
		"movq %%mm2, %%mm3				\n\t" // L11
		"movq %%mm2, %%mm4				\n\t" // L11
		"psllq $8, %%mm3				\n\t"
		"psrlq $8, %%mm4				\n\t"
		"movd -4(%%eax), %%mm5				\n\t"
		"movd 8(%%eax), %%mm6				\n\t"
		"psrlq $24, %%mm5				\n\t"
		"psllq $56, %%mm6				\n\t"
		"por %%mm5, %%mm3				\n\t" // L01
		"por %%mm6, %%mm4				\n\t" // L21
		"movq %%mm3, %%mm5				\n\t" // L01
		PAVGB(%%mm4, %%mm3)				      // (L21 + L01)/2
		PAVGB(%%mm2, %%mm3)				      // (L21 + L01 + 2L11)/4
		"psubusb %%mm7, %%mm2				\n\t"
		"psubusb %%mm7, %%mm4				\n\t"
		"psubusb %%mm7, %%mm5				\n\t"
		"pcmpeqb b00, %%mm2				\n\t" // L11 > a ? 0 : -1
		"pcmpeqb b00, %%mm4				\n\t" // L21 > a ? 0 : -1
		"pcmpeqb b00, %%mm5				\n\t" // L01 > a ? 0 : -1
		"paddb %%mm4, %%mm2				\n\t"
		"paddb %%mm5, %%mm2				\n\t"
// 0, 2, 3, 1
#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
		"movq " #src ", " #sx "				\n\t" /* src[0] */\
		"movq " #sx ", " #lx "				\n\t" /* src[0] */\
		"movq " #sx ", " #t0 "				\n\t" /* src[0] */\
		"psllq $8, " #lx "				\n\t"\
		"psrlq $8, " #t0 "				\n\t"\
		"movd -4" #src ", " #t1 "			\n\t"\
		"psrlq $24, " #t1 "				\n\t"\
		"por " #t1 ", " #lx "				\n\t" /* src[-1] */\
		"movd 8" #src ", " #t1 "			\n\t"\
		"psllq $56, " #t1 "				\n\t"\
		"por " #t1 ", " #t0 "				\n\t" /* src[+1] */\
		"movq " #lx ", " #t1 "				\n\t" /* src[-1] */\
		PAVGB(t0, lx)					      /* (src[-1] + src[+1])/2 */\
		PAVGB(sx, lx)					      /* (src[-1] + 2src[0] + src[+1])/4 */\
		PAVGB(lx, pplx)					     \
		"movq " #lx ", temp1				\n\t"\
		"movq temp0, " #lx "				\n\t"\
		"psubusb " #lx ", " #t1 "			\n\t"\
		"psubusb " #lx ", " #t0 "			\n\t"\
		"psubusb " #lx ", " #sx "			\n\t"\
		"movq b00, " #lx "				\n\t"\
		"pcmpeqb " #lx ", " #t1 "			\n\t" /* src[-1] > a ? 0 : -1*/\
		"pcmpeqb " #lx ", " #t0 "			\n\t" /* src[+1] > a ? 0 : -1*/\
		"pcmpeqb " #lx ", " #sx "			\n\t" /* src[0]  > a ? 0 : -1*/\
		"paddb " #t1 ", " #t0 "				\n\t"\
		"paddb " #t0 ", " #sx "				\n\t"\
\
		PAVGB(plx, pplx)				      /* filtered */\
		"movq " #dst ", " #t0 "				\n\t" /* dst */\
		"movq " #t0 ", " #t1 "				\n\t" /* dst */\
		"psubusb pQPb2, " #t0 "				\n\t"\
		"paddusb pQPb2, " #t1 "				\n\t"\
		PMAXUB(t0, pplx)\
		PMINUB(t1, pplx, t0)\
		"paddb " #sx ", " #ppsx "			\n\t"\
		"paddb " #psx ", " #ppsx "			\n\t"\
		"#paddb b02, " #ppsx "				\n\t"\
		"pand b08, " #ppsx "				\n\t"\
		"pcmpeqb " #lx ", " #ppsx "			\n\t"\
		"pand " #ppsx ", " #pplx "			\n\t"\
		"pandn " #dst ", " #ppsx "			\n\t"\
		"por " #pplx ", " #ppsx "			\n\t"\
		"movq " #ppsx ", " #dst "			\n\t"\
		"movq temp1, " #lx "				\n\t"

/*
0000000
1111111

1111110
1111101
1111100
1111011
1111010
1111001

1111000
1110111
*/

//DERING_CORE(dst,src                 ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
DERING_CORE((%%eax),(%%eax, %1)       ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%eax, %1),(%%eax, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%eax, %1, 2),(%0, %1, 4),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%0, %1, 4),(%%ebx)       ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%ebx),(%%ebx, %1)       ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1, 2),(%0, %1, 8),%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%0, %1, 8),(%%ebx, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)

		: : "r" (src), "r" (stride), "r" (QP)
		: "%eax", "%ebx"
	);
#else
	// C fallback
	int y;
	int min=255;
	int max=0;
	int avg;
	uint8_t *p;
	int s[10];

	// min/max over the inner 8x8 (columns 1-8, rows 1-8)
	for(y=1; y<9; y++)
	{
		int x;
		p= src + stride*y;
		for(x=1; x<9; x++)
		{
			p++;
			if(*p > max) max= *p;
			if(*p < min) min= *p;
		}
	}
	avg= (min + max + 1)/2;

	// per row, bit x of s[y] is set when pixels x-1, x, x+1 are all on the
	// same side of avg: the upper 16 bits hold the complement of the
	// "above avg" mask so one AND covers both the all-above and the
	// all-below case at once
	for(y=0; y<10; y++)
	{
		int x;
		int t = 0;
		p= src + stride*y;
		for(x=0; x<10; x++)
		{
			if(*p > avg) t |= (1<<x);
			p++;
		}
		t |= (~t)<<16;
		t &= (t<<1) & (t>>1);
		s[y] = t;
	}

	// smooth pixels whose whole 3x3 neighbourhood is on one side of avg
	// with a (1,2,1;2,4,2;1,2,1)/16 kernel, change limited to +-2*QP
	for(y=1; y<9; y++)
	{
		int x;
		int t = s[y-1] & s[y] & s[y+1];
		t|= t>>16;
		p= src + stride*y;
		for(x=1; x<9; x++)
		{
			p++;
			if(t & (1<<x))
			{
				int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
				      +2*(*(p       -1)) + 4*(*p         ) + 2*(*(p       +1))
				      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
				f= (f + 8)>>4;
				if     (*p + 2*QP < f) *p= *p + 2*QP;
				else if(*p - 2*QP > f) *p= *p - 2*QP;
				else *p=f;
			}
		}
	}
#endif
}
/**
 * Deinterlaces the given block
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 */
static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= 4*stride;		// skip the already-filtered lines 0-3
	asm volatile(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		// replace each odd line (1,3,5,7 of this window) with the PAVGB
		// average of its two even neighbours
		"movq (%0), %%mm0				\n\t"
		"movq (%%eax, %1), %%mm1			\n\t"
		PAVGB(%%mm1, %%mm0)
		"movq %%mm0, (%%eax)				\n\t"
		"movq (%0, %1, 4), %%mm0			\n\t"
		PAVGB(%%mm0, %%mm1)
		"movq %%mm1, (%%eax, %1, 2)			\n\t"
		"movq (%%ebx, %1), %%mm1			\n\t"
		PAVGB(%%mm1, %%mm0)
		"movq %%mm0, (%%ebx)				\n\t"
		"movq (%0, %1, 8), %%mm0			\n\t"
		PAVGB(%%mm0, %%mm1)
		"movq %%mm1, (%%ebx, %1, 2)			\n\t"

		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	// C fallback: same interpolation with a truncating (a+b)>>1 average,
	// so it can differ from the SIMD path by the rounding bit (rated "e"
	// vs "E" in the accuracy table at the top of the file)
	int x;
	src+= 4*stride;
	for(x=0; x<8; x++)
	{
		src[stride]   = (src[0]        + src[stride*2])>>1;
		src[stride*3] = (src[stride*2] + src[stride*4])>>1;
		src[stride*5] = (src[stride*4] + src[stride*6])>>1;
		src[stride*7] = (src[stride*6] + src[stride*8])>>1;
		src++;
	}
#endif
}
  1697. /**
  1698. * Deinterlaces the given block
  1699. * will be called for every 8x8 block and can read & write from line 4-15
  1700. * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
  1701. * lines 4-12 will be read into the deblocking filter and should be deinterlaced
  1702. * this filter will read lines 3-15 and write 7-13
  1703. * no cliping in C version
  1704. */
  1705. static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
  1706. {
  1707. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  1708. src+= stride*3;
  1709. asm volatile(
  1710. "leal (%0, %1), %%eax \n\t"
  1711. "leal (%%eax, %1, 4), %%ebx \n\t"
  1712. "leal (%%ebx, %1, 4), %%ecx \n\t"
  1713. "addl %1, %%ecx \n\t"
  1714. "pxor %%mm7, %%mm7 \n\t"
  1715. // 0 1 2 3 4 5 6 7 8 9 10
  1716. // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 ecx
  1717. #define DEINT_CUBIC(a,b,c,d,e)\
  1718. "movq " #a ", %%mm0 \n\t"\
  1719. "movq " #b ", %%mm1 \n\t"\
  1720. "movq " #d ", %%mm2 \n\t"\
  1721. "movq " #e ", %%mm3 \n\t"\
  1722. PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
  1723. PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\
  1724. "movq %%mm0, %%mm2 \n\t"\
  1725. "punpcklbw %%mm7, %%mm0 \n\t"\
  1726. "punpckhbw %%mm7, %%mm2 \n\t"\
  1727. "movq %%mm1, %%mm3 \n\t"\
  1728. "punpcklbw %%mm7, %%mm1 \n\t"\
  1729. "punpckhbw %%mm7, %%mm3 \n\t"\
  1730. "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
  1731. "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
  1732. "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
  1733. "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
  1734. "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
  1735. "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
  1736. "packuswb %%mm3, %%mm1 \n\t"\
  1737. "movq %%mm1, " #c " \n\t"
  1738. DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
  1739. DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
  1740. DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
  1741. DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
  1742. : : "r" (src), "r" (stride)
  1743. : "%eax", "%ebx", "ecx"
  1744. );
  1745. #else
  1746. int x;
  1747. src+= stride*3;
  1748. for(x=0; x<8; x++)
  1749. {
  1750. src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
  1751. src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
  1752. src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
  1753. src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
  1754. src++;
  1755. }
  1756. #endif
  1757. }
/**
 * Deinterlaces the given block by linear blending (1:2:1 vertical filter).
 * Will be called for every 8x8 block and can read & write from line 4-15.
 * Lines 0-3 have been passed through the deblock / dering filters already, but can be read too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 * Will shift the image up by 1 line (FIXME if this is a problem).
 * This filter will read lines 4-13 and write 4-11.
 */
static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= 4*stride;
	asm volatile(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		// two chained PAVGBs give ((Ln-1 + Ln+1)/2 + Ln)/2 = (Ln-1 + 2Ln + Ln+1)/4
		"movq (%0), %%mm0				\n\t" // L0
		"movq (%%eax, %1), %%mm1			\n\t" // L2
		PAVGB(%%mm1, %%mm0)				      // L0+L2
		"movq (%%eax), %%mm2				\n\t" // L1
		PAVGB(%%mm2, %%mm0)
		"movq %%mm0, (%0)				\n\t"
		"movq (%%eax, %1, 2), %%mm0			\n\t" // L3
		PAVGB(%%mm0, %%mm2)				      // L1+L3
		PAVGB(%%mm1, %%mm2)				      // 2L2 + L1 + L3
		"movq %%mm2, (%%eax)				\n\t"
		"movq (%0, %1, 4), %%mm2			\n\t" // L4
		PAVGB(%%mm2, %%mm1)				      // L2+L4
		PAVGB(%%mm0, %%mm1)				      // 2L3 + L2 + L4
		"movq %%mm1, (%%eax, %1)			\n\t"
		"movq (%%ebx), %%mm1				\n\t" // L5
		PAVGB(%%mm1, %%mm0)				      // L3+L5
		PAVGB(%%mm2, %%mm0)				      // 2L4 + L3 + L5
		"movq %%mm0, (%%eax, %1, 2)			\n\t"
		"movq (%%ebx, %1), %%mm0			\n\t" // L6
		PAVGB(%%mm0, %%mm2)				      // L4+L6
		PAVGB(%%mm1, %%mm2)				      // 2L5 + L4 + L6
		"movq %%mm2, (%0, %1, 4)			\n\t"
		"movq (%%ebx, %1, 2), %%mm2			\n\t" // L7
		PAVGB(%%mm2, %%mm1)				      // L5+L7
		PAVGB(%%mm0, %%mm1)				      // 2L6 + L5 + L7
		"movq %%mm1, (%%ebx)				\n\t"
		"movq (%0, %1, 8), %%mm1			\n\t" // L8
		PAVGB(%%mm1, %%mm0)				      // L6+L8
		PAVGB(%%mm2, %%mm0)				      // 2L7 + L6 + L8
		"movq %%mm0, (%%ebx, %1)			\n\t"
		"movq (%%ebx, %1, 4), %%mm0			\n\t" // L9
		PAVGB(%%mm0, %%mm2)				      // L7+L9
		PAVGB(%%mm1, %%mm2)				      // 2L8 + L7 + L9
		"movq %%mm2, (%%ebx, %1, 2)			\n\t"
		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	int x;
	src+= 4*stride;
	for(x=0; x<8; x++)
	{
		// C fallback: truncating (a + 2b + c)/4; the MMX path rounds slightly differently
		src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
		src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
		src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
		src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
		src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
		src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
		src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
		src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
		src++;
	}
#endif
}
  1829. /**
  1830. * Deinterlaces the given block
  1831. * will be called for every 8x8 block and can read & write from line 4-15,
  1832. * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
  1833. * lines 4-12 will be read into the deblocking filter and should be deinterlaced
  1834. */
  1835. static inline void deInterlaceMedian(uint8_t src[], int stride)
  1836. {
  1837. #ifdef HAVE_MMX
  1838. src+= 4*stride;
  1839. #ifdef HAVE_MMX2
  1840. asm volatile(
  1841. "leal (%0, %1), %%eax \n\t"
  1842. "leal (%%eax, %1, 4), %%ebx \n\t"
  1843. // 0 1 2 3 4 5 6 7 8 9
  1844. // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
  1845. "movq (%0), %%mm0 \n\t" //
  1846. "movq (%%eax, %1), %%mm2 \n\t" //
  1847. "movq (%%eax), %%mm1 \n\t" //
  1848. "movq %%mm0, %%mm3 \n\t"
  1849. "pmaxub %%mm1, %%mm0 \n\t" //
  1850. "pminub %%mm3, %%mm1 \n\t" //
  1851. "pmaxub %%mm2, %%mm1 \n\t" //
  1852. "pminub %%mm1, %%mm0 \n\t"
  1853. "movq %%mm0, (%%eax) \n\t"
  1854. "movq (%0, %1, 4), %%mm0 \n\t" //
  1855. "movq (%%eax, %1, 2), %%mm1 \n\t" //
  1856. "movq %%mm2, %%mm3 \n\t"
  1857. "pmaxub %%mm1, %%mm2 \n\t" //
  1858. "pminub %%mm3, %%mm1 \n\t" //
  1859. "pmaxub %%mm0, %%mm1 \n\t" //
  1860. "pminub %%mm1, %%mm2 \n\t"
  1861. "movq %%mm2, (%%eax, %1, 2) \n\t"
  1862. "movq (%%ebx), %%mm2 \n\t" //
  1863. "movq (%%ebx, %1), %%mm1 \n\t" //
  1864. "movq %%mm2, %%mm3 \n\t"
  1865. "pmaxub %%mm0, %%mm2 \n\t" //
  1866. "pminub %%mm3, %%mm0 \n\t" //
  1867. "pmaxub %%mm1, %%mm0 \n\t" //
  1868. "pminub %%mm0, %%mm2 \n\t"
  1869. "movq %%mm2, (%%ebx) \n\t"
  1870. "movq (%%ebx, %1, 2), %%mm2 \n\t" //
  1871. "movq (%0, %1, 8), %%mm0 \n\t" //
  1872. "movq %%mm2, %%mm3 \n\t"
  1873. "pmaxub %%mm0, %%mm2 \n\t" //
  1874. "pminub %%mm3, %%mm0 \n\t" //
  1875. "pmaxub %%mm1, %%mm0 \n\t" //
  1876. "pminub %%mm0, %%mm2 \n\t"
  1877. "movq %%mm2, (%%ebx, %1, 2) \n\t"
  1878. : : "r" (src), "r" (stride)
  1879. : "%eax", "%ebx"
  1880. );
  1881. #else // MMX without MMX2
  1882. asm volatile(
  1883. "leal (%0, %1), %%eax \n\t"
  1884. "leal (%%eax, %1, 4), %%ebx \n\t"
  1885. // 0 1 2 3 4 5 6 7 8 9
  1886. // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
  1887. "pxor %%mm7, %%mm7 \n\t"
  1888. #define MEDIAN(a,b,c)\
  1889. "movq " #a ", %%mm0 \n\t"\
  1890. "movq " #b ", %%mm2 \n\t"\
  1891. "movq " #c ", %%mm1 \n\t"\
  1892. "movq %%mm0, %%mm3 \n\t"\
  1893. "movq %%mm1, %%mm4 \n\t"\
  1894. "movq %%mm2, %%mm5 \n\t"\
  1895. "psubusb %%mm1, %%mm3 \n\t"\
  1896. "psubusb %%mm2, %%mm4 \n\t"\
  1897. "psubusb %%mm0, %%mm5 \n\t"\
  1898. "pcmpeqb %%mm7, %%mm3 \n\t"\
  1899. "pcmpeqb %%mm7, %%mm4 \n\t"\
  1900. "pcmpeqb %%mm7, %%mm5 \n\t"\
  1901. "movq %%mm3, %%mm6 \n\t"\
  1902. "pxor %%mm4, %%mm3 \n\t"\
  1903. "pxor %%mm5, %%mm4 \n\t"\
  1904. "pxor %%mm6, %%mm5 \n\t"\
  1905. "por %%mm3, %%mm1 \n\t"\
  1906. "por %%mm4, %%mm2 \n\t"\
  1907. "por %%mm5, %%mm0 \n\t"\
  1908. "pand %%mm2, %%mm0 \n\t"\
  1909. "pand %%mm1, %%mm0 \n\t"\
  1910. "movq %%mm0, " #b " \n\t"
  1911. MEDIAN((%0), (%%eax), (%%eax, %1))
  1912. MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
  1913. MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
  1914. MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))
  1915. : : "r" (src), "r" (stride)
  1916. : "%eax", "%ebx"
  1917. );
  1918. #endif // MMX
  1919. #else
  1920. //FIXME
  1921. int x;
  1922. src+= 4*stride;
  1923. for(x=0; x<8; x++)
  1924. {
  1925. src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
  1926. src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
  1927. src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
  1928. src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
  1929. src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
  1930. src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
  1931. src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
  1932. src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
  1933. src++;
  1934. }
  1935. #endif
  1936. }
  1937. #ifdef HAVE_MMX
/**
 * Transposes and shifts the given 8x8 block into dst1 and dst2.
 * The transposed columns are written with a pitch of 16 bytes:
 * dst1 receives data starting at offset 128 (rows 8..12 of a 16-wide
 * scratch area), dst2 receives the overlapping lower rows starting at
 * offset 48 — NOTE(review): the exact offsets encode how the horizontal
 * filter pass reuses this scratch layout; confirm against the callers.
 */
static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
{
	asm(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		// left 4x8 half: interleave bytes/words to rotate 4 source rows
		"movq (%0), %%mm0				\n\t" // 12345678
		"movq (%%eax), %%mm1				\n\t" // abcdefgh
		"movq %%mm0, %%mm2				\n\t" // 12345678
		"punpcklbw %%mm1, %%mm0				\n\t" // 1a2b3c4d
		"punpckhbw %%mm1, %%mm2				\n\t" // 5e6f7g8h

		"movq (%%eax, %1), %%mm1			\n\t"
		"movq (%%eax, %1, 2), %%mm3			\n\t"
		"movq %%mm1, %%mm4				\n\t"
		"punpcklbw %%mm3, %%mm1				\n\t"
		"punpckhbw %%mm3, %%mm4				\n\t"

		"movq %%mm0, %%mm3				\n\t"
		"punpcklwd %%mm1, %%mm0				\n\t"
		"punpckhwd %%mm1, %%mm3				\n\t"
		"movq %%mm2, %%mm1				\n\t"
		"punpcklwd %%mm4, %%mm2				\n\t"
		"punpckhwd %%mm4, %%mm1				\n\t"

		// scatter the 4-byte column halves (pitch 16) into dst1 / dst2
		"movd %%mm0, 128(%2)				\n\t"
		"psrlq $32, %%mm0				\n\t"
		"movd %%mm0, 144(%2)				\n\t"
		"movd %%mm3, 160(%2)				\n\t"
		"psrlq $32, %%mm3				\n\t"
		"movd %%mm3, 176(%2)				\n\t"
		"movd %%mm3, 48(%3)				\n\t"
		"movd %%mm2, 192(%2)				\n\t"
		"movd %%mm2, 64(%3)				\n\t"
		"psrlq $32, %%mm2				\n\t"
		"movd %%mm2, 80(%3)				\n\t"
		"movd %%mm1, 96(%3)				\n\t"
		"psrlq $32, %%mm1				\n\t"
		"movd %%mm1, 112(%3)				\n\t"

		// right 4x8 half: same interleave on source rows 4..7
		"movq (%0, %1, 4), %%mm0			\n\t" // 12345678
		"movq (%%ebx), %%mm1				\n\t" // abcdefgh
		"movq %%mm0, %%mm2				\n\t" // 12345678
		"punpcklbw %%mm1, %%mm0				\n\t" // 1a2b3c4d
		"punpckhbw %%mm1, %%mm2				\n\t" // 5e6f7g8h

		"movq (%%ebx, %1), %%mm1			\n\t"
		"movq (%%ebx, %1, 2), %%mm3			\n\t"
		"movq %%mm1, %%mm4				\n\t"
		"punpcklbw %%mm3, %%mm1				\n\t"
		"punpckhbw %%mm3, %%mm4				\n\t"

		"movq %%mm0, %%mm3				\n\t"
		"punpcklwd %%mm1, %%mm0				\n\t"
		"punpckhwd %%mm1, %%mm3				\n\t"
		"movq %%mm2, %%mm1				\n\t"
		"punpcklwd %%mm4, %%mm2				\n\t"
		"punpckhwd %%mm4, %%mm1				\n\t"

		// same scatter, 4 bytes to the right (column offset +4)
		"movd %%mm0, 132(%2)				\n\t"
		"psrlq $32, %%mm0				\n\t"
		"movd %%mm0, 148(%2)				\n\t"
		"movd %%mm3, 164(%2)				\n\t"
		"psrlq $32, %%mm3				\n\t"
		"movd %%mm3, 180(%2)				\n\t"
		"movd %%mm3, 52(%3)				\n\t"
		"movd %%mm2, 196(%2)				\n\t"
		"movd %%mm2, 68(%3)				\n\t"
		"psrlq $32, %%mm2				\n\t"
		"movd %%mm2, 84(%3)				\n\t"
		"movd %%mm1, 100(%3)				\n\t"
		"psrlq $32, %%mm1				\n\t"
		"movd %%mm1, 116(%3)				\n\t"

		:: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
		: "%eax", "%ebx"
	);
}
/**
 * Transposes the given 8x8 block back: reads rows from the 16-byte-pitch
 * scratch buffer src and writes the transposed block to dst with dstStride.
 */
static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
{
	asm(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		// left half: interleave scratch rows 0,16,32,48
		"movq (%2), %%mm0				\n\t" // 12345678
		"movq 16(%2), %%mm1				\n\t" // abcdefgh
		"movq %%mm0, %%mm2				\n\t" // 12345678
		"punpcklbw %%mm1, %%mm0				\n\t" // 1a2b3c4d
		"punpckhbw %%mm1, %%mm2				\n\t" // 5e6f7g8h

		"movq 32(%2), %%mm1				\n\t"
		"movq 48(%2), %%mm3				\n\t"
		"movq %%mm1, %%mm4				\n\t"
		"punpcklbw %%mm3, %%mm1				\n\t"
		"punpckhbw %%mm3, %%mm4				\n\t"

		"movq %%mm0, %%mm3				\n\t"
		"punpcklwd %%mm1, %%mm0				\n\t"
		"punpckhwd %%mm1, %%mm3				\n\t"
		"movq %%mm2, %%mm1				\n\t"
		"punpcklwd %%mm4, %%mm2				\n\t"
		"punpckhwd %%mm4, %%mm1				\n\t"

		// store the left 4 bytes of each destination row
		"movd %%mm0, (%0)				\n\t"
		"psrlq $32, %%mm0				\n\t"
		"movd %%mm0, (%%eax)				\n\t"
		"movd %%mm3, (%%eax, %1)			\n\t"
		"psrlq $32, %%mm3				\n\t"
		"movd %%mm3, (%%eax, %1, 2)			\n\t"
		"movd %%mm2, (%0, %1, 4)			\n\t"
		"psrlq $32, %%mm2				\n\t"
		"movd %%mm2, (%%ebx)				\n\t"
		"movd %%mm1, (%%ebx, %1)			\n\t"
		"psrlq $32, %%mm1				\n\t"
		"movd %%mm1, (%%ebx, %1, 2)			\n\t"

		// right half: interleave scratch rows 64,80,96,112
		"movq 64(%2), %%mm0				\n\t" // 12345678
		"movq 80(%2), %%mm1				\n\t" // abcdefgh
		"movq %%mm0, %%mm2				\n\t" // 12345678
		"punpcklbw %%mm1, %%mm0				\n\t" // 1a2b3c4d
		"punpckhbw %%mm1, %%mm2				\n\t" // 5e6f7g8h

		"movq 96(%2), %%mm1				\n\t"
		"movq 112(%2), %%mm3				\n\t"
		"movq %%mm1, %%mm4				\n\t"
		"punpcklbw %%mm3, %%mm1				\n\t"
		"punpckhbw %%mm3, %%mm4				\n\t"

		"movq %%mm0, %%mm3				\n\t"
		"punpcklwd %%mm1, %%mm0				\n\t"
		"punpckhwd %%mm1, %%mm3				\n\t"
		"movq %%mm2, %%mm1				\n\t"
		"punpcklwd %%mm4, %%mm2				\n\t"
		"punpckhwd %%mm4, %%mm1				\n\t"

		// store the right 4 bytes of each destination row
		"movd %%mm0, 4(%0)				\n\t"
		"psrlq $32, %%mm0				\n\t"
		"movd %%mm0, 4(%%eax)				\n\t"
		"movd %%mm3, 4(%%eax, %1)			\n\t"
		"psrlq $32, %%mm3				\n\t"
		"movd %%mm3, 4(%%eax, %1, 2)			\n\t"
		"movd %%mm2, 4(%0, %1, 4)			\n\t"
		"psrlq $32, %%mm2				\n\t"
		"movd %%mm2, 4(%%ebx)				\n\t"
		"movd %%mm1, 4(%%ebx, %1)			\n\t"
		"psrlq $32, %%mm1				\n\t"
		"movd %%mm1, 4(%%ebx, %1, 2)			\n\t"

		:: "r" (dst), "r" (dstStride), "r" (src)
		: "%eax", "%ebx"
	);
}
  2082. #endif
//static int test=0;

/**
 * Temporal noise reducer.
 * Measures the difference between the current 8x8 block (src) and the stored
 * filtered block (tempBlured), smoothes that measure over the per-block noise
 * history (tempBluredPast, a 2D array with a pitch of 256 uint32 entries per
 * block row — neighbours at +-1 and +-256 entries are read), and then blends
 * src and tempBlured with a strength selected by the three thresholds:
 *   diff >= maxNoise[2] : copy src   (scene change, no filtering)
 *   diff >= maxNoise[1] : 1:1 average
 *   diff >= maxNoise[0] : (3*old + new)/4
 *   diff <  maxNoise[0] : (7*old + new)/8  (strongest smoothing)
 * Both src and tempBlured are updated with the blended result.
 * NOTE(review): the MMX path compares against the global maxTmpNoise rather
 * than the maxNoise argument — confirm the caller keeps them in sync.
 */
static void inline tempNoiseReducer(uint8_t *src, int stride,
	uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
{
#define FAST_L2_DIFF
//#define L1_DIFF //you should change the thresholds too if you try that one
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"leal (%2, %2, 2), %%eax			\n\t" // 3*stride
		"leal (%2, %2, 4), %%ebx			\n\t" // 5*stride
		"leal (%%ebx, %2, 2), %%ecx			\n\t" // 7*stride
//	0	1	2	3	4	5	6	7	8	9
//	%x	%x+%2	%x+2%2	%x+eax	%x+4%2	%x+ebx	%x+2eax	%x+ecx	%x+8%2
//FIXME reorder?
#ifdef L1_DIFF //needs mmx2
		// sum of absolute differences over the 8 lines -> mm0
		"movq (%0), %%mm0				\n\t" // L0
		"psadbw (%1), %%mm0				\n\t" // |L0-R0|
		"movq (%0, %2), %%mm1				\n\t" // L1
		"psadbw (%1, %2), %%mm1				\n\t" // |L1-R1|
		"movq (%0, %2, 2), %%mm2			\n\t" // L2
		"psadbw (%1, %2, 2), %%mm2			\n\t" // |L2-R2|
		"movq (%0, %%eax), %%mm3			\n\t" // L3
		"psadbw (%1, %%eax), %%mm3			\n\t" // |L3-R3|

		"movq (%0, %2, 4), %%mm4			\n\t" // L4
		"paddw %%mm1, %%mm0				\n\t"
		"psadbw (%1, %2, 4), %%mm4			\n\t" // |L4-R4|
		"movq (%0, %%ebx), %%mm5			\n\t" // L5
		"paddw %%mm2, %%mm0				\n\t"
		"psadbw (%1, %%ebx), %%mm5			\n\t" // |L5-R5|
		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
		"paddw %%mm3, %%mm0				\n\t"
		"psadbw (%1, %%eax, 2), %%mm6			\n\t" // |L6-R6|
		"movq (%0, %%ecx), %%mm7			\n\t" // L7
		"paddw %%mm4, %%mm0				\n\t"
		"psadbw (%1, %%ecx), %%mm7			\n\t" // |L7-R7|
		"paddw %%mm5, %%mm6				\n\t"
		"paddw %%mm7, %%mm6				\n\t"
		"paddw %%mm6, %%mm0				\n\t"
#elif defined (FAST_L2_DIFF)
		// approximate sum of squared differences via averaged signed bytes
		"pcmpeqb %%mm7, %%mm7				\n\t"
		"movq b80, %%mm6				\n\t"
		"pxor %%mm0, %%mm0				\n\t"
#define L2_DIFF_CORE(a, b)\
		"movq " #a ", %%mm5				\n\t"\
		"movq " #b ", %%mm2				\n\t"\
		"pxor %%mm7, %%mm2				\n\t"\
		PAVGB(%%mm2, %%mm5)\
		"paddb %%mm6, %%mm5				\n\t"\
		"movq %%mm5, %%mm2				\n\t"\
		"psllw $8, %%mm5				\n\t"\
		"pmaddwd %%mm5, %%mm5				\n\t"\
		"pmaddwd %%mm2, %%mm2				\n\t"\
		"paddd %%mm2, %%mm5				\n\t"\
		"psrld $14, %%mm5				\n\t"\
		"paddd %%mm5, %%mm0				\n\t"

L2_DIFF_CORE((%0), (%1))
L2_DIFF_CORE((%0, %2), (%1, %2))
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))

#else
		// exact sum of squared differences
		"pxor %%mm7, %%mm7				\n\t"
		"pxor %%mm0, %%mm0				\n\t"
#define L2_DIFF_CORE(a, b)\
		"movq " #a ", %%mm5				\n\t"\
		"movq " #b ", %%mm2				\n\t"\
		"movq %%mm5, %%mm1				\n\t"\
		"movq %%mm2, %%mm3				\n\t"\
		"punpcklbw %%mm7, %%mm5				\n\t"\
		"punpckhbw %%mm7, %%mm1				\n\t"\
		"punpcklbw %%mm7, %%mm2				\n\t"\
		"punpckhbw %%mm7, %%mm3				\n\t"\
		"psubw %%mm2, %%mm5				\n\t"\
		"psubw %%mm3, %%mm1				\n\t"\
		"pmaddwd %%mm5, %%mm5				\n\t"\
		"pmaddwd %%mm1, %%mm1				\n\t"\
		"paddd %%mm1, %%mm5				\n\t"\
		"paddd %%mm5, %%mm0				\n\t"

L2_DIFF_CORE((%0), (%1))
L2_DIFF_CORE((%0, %2), (%1, %2))
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))

#endif

		// fold the two 32bit partial sums and smooth over the noise history:
		// new = (4*diff + past[-1] + past[+1] + past[-row] + past[+row] + 4)/8
		"movq %%mm0, %%mm4				\n\t"
		"psrlq $32, %%mm0				\n\t"
		"paddd %%mm0, %%mm4				\n\t"
		"movd %%mm4, %%ecx				\n\t"
		"shll $2, %%ecx					\n\t"
		"movl %3, %%ebx					\n\t"
		"addl -4(%%ebx), %%ecx				\n\t"
		"addl 4(%%ebx), %%ecx				\n\t"
		"addl -1024(%%ebx), %%ecx			\n\t"
		"addl $4, %%ecx					\n\t"
		"addl 1024(%%ebx), %%ecx			\n\t"
		"shrl $3, %%ecx					\n\t"
		"movl %%ecx, (%%ebx)				\n\t"
		"leal (%%eax, %2, 2), %%ebx			\n\t" // 5*stride

//		"movl %3, %%ecx					\n\t"
//		"movl %%ecx, test				\n\t"
//		"jmp 4f \n\t"
		// threshold selection (NOTE: compares the global maxTmpNoise)
		"cmpl 4+maxTmpNoise, %%ecx			\n\t"
		" jb 2f						\n\t"
		"cmpl 8+maxTmpNoise, %%ecx			\n\t"
		" jb 1f						\n\t"

		// diff >= maxTmpNoise[2]: copy src into tempBlured unfiltered
		"leal (%%ebx, %2, 2), %%ecx			\n\t" // 7*stride
		"movq (%0), %%mm0				\n\t" // L0
		"movq (%0, %2), %%mm1				\n\t" // L1
		"movq (%0, %2, 2), %%mm2			\n\t" // L2
		"movq (%0, %%eax), %%mm3			\n\t" // L3
		"movq (%0, %2, 4), %%mm4			\n\t" // L4
		"movq (%0, %%ebx), %%mm5			\n\t" // L5
		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
		"movq (%0, %%ecx), %%mm7			\n\t" // L7
		"movq %%mm0, (%1)				\n\t" // L0
		"movq %%mm1, (%1, %2)				\n\t" // L1
		"movq %%mm2, (%1, %2, 2)			\n\t" // L2
		"movq %%mm3, (%1, %%eax)			\n\t" // L3
		"movq %%mm4, (%1, %2, 4)			\n\t" // L4
		"movq %%mm5, (%1, %%ebx)			\n\t" // L5
		"movq %%mm6, (%1, %%eax, 2)			\n\t" // L6
		"movq %%mm7, (%1, %%ecx)			\n\t" // L7
		"jmp 4f						\n\t"

		"1:						\n\t" // 1:1 average of src and tempBlured
		"leal (%%ebx, %2, 2), %%ecx			\n\t" // 7*stride
		"movq (%0), %%mm0				\n\t" // L0
		"pavgb (%1), %%mm0				\n\t" // L0
		"movq (%0, %2), %%mm1				\n\t" // L1
		"pavgb (%1, %2), %%mm1				\n\t" // L1
		"movq (%0, %2, 2), %%mm2			\n\t" // L2
		"pavgb (%1, %2, 2), %%mm2			\n\t" // L2
		"movq (%0, %%eax), %%mm3			\n\t" // L3
		"pavgb (%1, %%eax), %%mm3			\n\t" // L3
		"movq (%0, %2, 4), %%mm4			\n\t" // L4
		"pavgb (%1, %2, 4), %%mm4			\n\t" // L4
		"movq (%0, %%ebx), %%mm5			\n\t" // L5
		"pavgb (%1, %%ebx), %%mm5			\n\t" // L5
		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
		"pavgb (%1, %%eax, 2), %%mm6			\n\t" // L6
		"movq (%0, %%ecx), %%mm7			\n\t" // L7
		"pavgb (%1, %%ecx), %%mm7			\n\t" // L7
		"movq %%mm0, (%1)				\n\t" // R0
		"movq %%mm1, (%1, %2)				\n\t" // R1
		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
		"movq %%mm3, (%1, %%eax)			\n\t" // R3
		"movq %%mm4, (%1, %2, 4)			\n\t" // R4
		"movq %%mm5, (%1, %%ebx)			\n\t" // R5
		"movq %%mm6, (%1, %%eax, 2)			\n\t" // R6
		"movq %%mm7, (%1, %%ecx)			\n\t" // R7
		"movq %%mm0, (%0)				\n\t" // L0
		"movq %%mm1, (%0, %2)				\n\t" // L1
		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
		"movq %%mm3, (%0, %%eax)			\n\t" // L3
		"movq %%mm4, (%0, %2, 4)			\n\t" // L4
		"movq %%mm5, (%0, %%ebx)			\n\t" // L5
		"movq %%mm6, (%0, %%eax, 2)			\n\t" // L6
		"movq %%mm7, (%0, %%ecx)			\n\t" // L7
		"jmp 4f						\n\t"

		"2:						\n\t" // below maxTmpNoise[1]
		"cmpl maxTmpNoise, %%ecx			\n\t"
		" jb 3f						\n\t"

		// maxTmpNoise[0] <= diff < maxTmpNoise[1]: (3*old + new)/4 (two PAVGBs)
		"leal (%%ebx, %2, 2), %%ecx			\n\t" // 7*stride
		"movq (%0), %%mm0				\n\t" // L0
		"movq (%0, %2), %%mm1				\n\t" // L1
		"movq (%0, %2, 2), %%mm2			\n\t" // L2
		"movq (%0, %%eax), %%mm3			\n\t" // L3
		"movq (%1), %%mm4				\n\t" // R0
		"movq (%1, %2), %%mm5				\n\t" // R1
		"movq (%1, %2, 2), %%mm6			\n\t" // R2
		"movq (%1, %%eax), %%mm7			\n\t" // R3
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		"movq %%mm0, (%1)				\n\t" // R0
		"movq %%mm1, (%1, %2)				\n\t" // R1
		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
		"movq %%mm3, (%1, %%eax)			\n\t" // R3
		"movq %%mm0, (%0)				\n\t" // L0
		"movq %%mm1, (%0, %2)				\n\t" // L1
		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
		"movq %%mm3, (%0, %%eax)			\n\t" // L3

		"movq (%0, %2, 4), %%mm0			\n\t" // L4
		"movq (%0, %%ebx), %%mm1			\n\t" // L5
		"movq (%0, %%eax, 2), %%mm2			\n\t" // L6
		"movq (%0, %%ecx), %%mm3			\n\t" // L7
		"movq (%1, %2, 4), %%mm4			\n\t" // R4
		"movq (%1, %%ebx), %%mm5			\n\t" // R5
		"movq (%1, %%eax, 2), %%mm6			\n\t" // R6
		"movq (%1, %%ecx), %%mm7			\n\t" // R7
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		"movq %%mm0, (%1, %2, 4)			\n\t" // R4
		"movq %%mm1, (%1, %%ebx)			\n\t" // R5
		"movq %%mm2, (%1, %%eax, 2)			\n\t" // R6
		"movq %%mm3, (%1, %%ecx)			\n\t" // R7
		"movq %%mm0, (%0, %2, 4)			\n\t" // L4
		"movq %%mm1, (%0, %%ebx)			\n\t" // L5
		"movq %%mm2, (%0, %%eax, 2)			\n\t" // L6
		"movq %%mm3, (%0, %%ecx)			\n\t" // L7
		"jmp 4f						\n\t"

		"3:						\n\t" // diff < maxTmpNoise[0]: (7*old + new)/8 (three PAVGBs)
		"leal (%%ebx, %2, 2), %%ecx			\n\t" // 7*stride
		"movq (%0), %%mm0				\n\t" // L0
		"movq (%0, %2), %%mm1				\n\t" // L1
		"movq (%0, %2, 2), %%mm2			\n\t" // L2
		"movq (%0, %%eax), %%mm3			\n\t" // L3
		"movq (%1), %%mm4				\n\t" // R0
		"movq (%1, %2), %%mm5				\n\t" // R1
		"movq (%1, %2, 2), %%mm6			\n\t" // R2
		"movq (%1, %%eax), %%mm7			\n\t" // R3
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		"movq %%mm0, (%1)				\n\t" // R0
		"movq %%mm1, (%1, %2)				\n\t" // R1
		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
		"movq %%mm3, (%1, %%eax)			\n\t" // R3
		"movq %%mm0, (%0)				\n\t" // L0
		"movq %%mm1, (%0, %2)				\n\t" // L1
		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
		"movq %%mm3, (%0, %%eax)			\n\t" // L3

		"movq (%0, %2, 4), %%mm0			\n\t" // L4
		"movq (%0, %%ebx), %%mm1			\n\t" // L5
		"movq (%0, %%eax, 2), %%mm2			\n\t" // L6
		"movq (%0, %%ecx), %%mm3			\n\t" // L7
		"movq (%1, %2, 4), %%mm4			\n\t" // R4
		"movq (%1, %%ebx), %%mm5			\n\t" // R5
		"movq (%1, %%eax, 2), %%mm6			\n\t" // R6
		"movq (%1, %%ecx), %%mm7			\n\t" // R7
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm4, %%mm0)
		PAVGB(%%mm5, %%mm1)
		PAVGB(%%mm6, %%mm2)
		PAVGB(%%mm7, %%mm3)
		"movq %%mm0, (%1, %2, 4)			\n\t" // R4
		"movq %%mm1, (%1, %%ebx)			\n\t" // R5
		"movq %%mm2, (%1, %%eax, 2)			\n\t" // R6
		"movq %%mm3, (%1, %%ecx)			\n\t" // R7
		"movq %%mm0, (%0, %2, 4)			\n\t" // L4
		"movq %%mm1, (%0, %%ebx)			\n\t" // L5
		"movq %%mm2, (%0, %%eax, 2)			\n\t" // L6
		"movq %%mm3, (%0, %%ecx)			\n\t" // L7

		"4:						\n\t"
		:: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
		: "%eax", "%ebx", "%ecx", "memory"
	);
//printf("%d\n", test);
#else
	int y;
	int d=0;            // sum of squared differences
	int sysd=0;         // signed difference sum; NOTE(review): accumulated but never used
	int i;
	for(y=0; y<8; y++)
	{
		int x;
		for(x=0; x<8; x++)
		{
			int ref= tempBlured[ x + y*stride ];
			int cur= src[ x + y*stride ];
			int d1=ref - cur;
//			if(x==0 || x==7) d1+= d1>>1;
//			if(y==0 || y==7) d1+= d1>>1;
//			d+= ABS(d1);
			d+= d1*d1;
			sysd+= d1;
		}
	}
	i=d;
	// smooth the noise measure over the 4 neighbouring blocks in the history
	// (history pitch is 256 entries per block row)
	d= (
		4*d
		+(*(tempBluredPast-256))
		+(*(tempBluredPast-1))+ (*(tempBluredPast+1))
		+(*(tempBluredPast+256))
		+4)>>3;
	*tempBluredPast=i;
//	((*tempBluredPast)*3 + d + 2)>>2;

//printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
/*
Switch between
 1  0  0  0  0  0  0  (0)
64 32 16  8  4  2  1  (1)
64 48 36 27 20 15 11  (33) (approx)
64 56 49 43 37 33 29  (200) (approx)
*/
	if(d > maxNoise[1])
	{
		if(d < maxNoise[2])
		{
			// medium difference: 1:1 average
			for(y=0; y<8; y++)
			{
				int x;
				for(x=0; x<8; x++)
				{
					int ref= tempBlured[ x + y*stride ];
					int cur= src[ x + y*stride ];
					tempBlured[ x + y*stride ]=
					src[ x + y*stride ]=
						(ref + cur + 1)>>1;
				}
			}
		}
		else
		{
			// large difference (scene change): reset history to src
			for(y=0; y<8; y++)
			{
				int x;
				for(x=0; x<8; x++)
				{
					tempBlured[ x + y*stride ]= src[ x + y*stride ];
				}
			}
		}
	}
	else
	{
		if(d < maxNoise[0])
		{
			// very small difference: strongest smoothing (7*old + new)/8
			for(y=0; y<8; y++)
			{
				int x;
				for(x=0; x<8; x++)
				{
					int ref= tempBlured[ x + y*stride ];
					int cur= src[ x + y*stride ];
					tempBlured[ x + y*stride ]=
					src[ x + y*stride ]=
						(ref*7 + cur + 4)>>3;
				}
			}
		}
		else
		{
			// small difference: (3*old + new)/4
			for(y=0; y<8; y++)
			{
				int x;
				for(x=0; x<8; x++)
				{
					int ref= tempBlured[ x + y*stride ];
					int cur= src[ x + y*stride ];
					tempBlured[ x + y*stride ]=
					src[ x + y*stride ]=
						(ref*3 + cur + 2)>>2;
				}
			}
		}
	}
#endif
}
  2464. #ifdef HAVE_ODIVX_POSTPROCESS
  2465. #include "../opendivx/postprocess.h"
  2466. int use_old_pp=0;
  2467. #endif
/* Forward declaration: the main per-plane postprocessing routine (defined later in this file). */
static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
	QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
  2470. /* -pp Command line Help
  2471. NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?
  2472. -pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...
  2473. long form example:
  2474. -pp vdeblock:autoq,hdeblock:autoq,linblenddeint -pp default,-vdeblock
  2475. short form example:
  2476. -pp vb:a,hb:a,lb -pp de,-vb
  2477. more examples:
  2478. -pp tn:64:128:256
  2479. Filters Options
  2480. short long name short long option Description
* *	a	autoq			cpu power dependent enabler
	c	chrom			chrominance filtering enabled
	y	nochrom			chrominance filtering disabled
  2484. hb hdeblock horizontal deblocking filter
  2485. vb vdeblock vertical deblocking filter
  2486. vr rkvdeblock
  2487. h1 x1hdeblock Experimental horizontal deblock filter 1
  2488. v1 x1vdeblock Experimental vertical deblock filter 1
  2489. dr dering not implemented yet
  2490. al autolevels automatic brightness / contrast fixer
  2491. f fullyrange stretch luminance range to (0..255)
  2492. lb linblenddeint linear blend deinterlacer
  2493. li linipoldeint linear interpolating deinterlacer
  2494. ci cubicipoldeint cubic interpolating deinterlacer
  2495. md mediandeint median deinterlacer
  2496. de default hdeblock:a,vdeblock:a,dering:a,autolevels
  2497. fa fast x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
  2498. tn tmpnoise (3 Thresholds) Temporal Noise Reducer
  2499. */
  2500. /**
* returns a PPMode struct which will have a non 0 error variable if an error occurred
  2502. * name is the string after "-pp" on the command line
  2503. * quality is a number from 0 to GET_PP_QUALITY_MAX
  2504. */
  2505. struct PPMode getPPModeByNameAndQuality(char *name, int quality)
  2506. {
  2507. char temp[GET_MODE_BUFFER_SIZE];
  2508. char *p= temp;
  2509. char *filterDelimiters= ",";
  2510. char *optionDelimiters= ":";
  2511. struct PPMode ppMode= {0,0,0,0,0,0,{150,200,400}};
  2512. char *filterToken;
  2513. strncpy(temp, name, GET_MODE_BUFFER_SIZE);
  2514. printf("%s\n", name);
  2515. for(;;){
  2516. char *filterName;
  2517. int q= 1000000; //GET_PP_QUALITY_MAX;
  2518. int chrom=-1;
  2519. char *option;
  2520. char *options[OPTIONS_ARRAY_SIZE];
  2521. int i;
  2522. int filterNameOk=0;
  2523. int numOfUnknownOptions=0;
  2524. int enable=1; //does the user want us to enabled or disabled the filter
  2525. filterToken= strtok(p, filterDelimiters);
  2526. if(filterToken == NULL) break;
  2527. p+= strlen(filterToken) + 1; // p points to next filterToken
  2528. filterName= strtok(filterToken, optionDelimiters);
  2529. printf("%s::%s\n", filterToken, filterName);
  2530. if(*filterName == '-')
  2531. {
  2532. enable=0;
  2533. filterName++;
  2534. }
  2535. for(;;){ //for all options
  2536. option= strtok(NULL, optionDelimiters);
  2537. if(option == NULL) break;
  2538. printf("%s\n", option);
  2539. if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
  2540. else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
  2541. else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
  2542. else
  2543. {
  2544. options[numOfUnknownOptions] = option;
  2545. numOfUnknownOptions++;
  2546. }
  2547. if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
  2548. }
  2549. options[numOfUnknownOptions] = NULL;
  2550. /* replace stuff from the replace Table */
  2551. for(i=0; replaceTable[2*i]!=NULL; i++)
  2552. {
  2553. if(!strcmp(replaceTable[2*i], filterName))
  2554. {
  2555. int newlen= strlen(replaceTable[2*i + 1]);
  2556. int plen;
  2557. int spaceLeft;
  2558. if(p==NULL) p= temp, *p=0; //last filter
  2559. else p--, *p=','; //not last filter
  2560. plen= strlen(p);
  2561. spaceLeft= (int)p - (int)temp + plen;
  2562. if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
  2563. {
  2564. ppMode.error++;
  2565. break;
  2566. }
  2567. memmove(p + newlen, p, plen+1);
  2568. memcpy(p, replaceTable[2*i + 1], newlen);
  2569. filterNameOk=1;
  2570. }
  2571. }
  2572. for(i=0; filters[i].shortName!=NULL; i++)
  2573. {
  2574. // printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
  2575. if( !strcmp(filters[i].longName, filterName)
  2576. || !strcmp(filters[i].shortName, filterName))
  2577. {
  2578. ppMode.lumMode &= ~filters[i].mask;
  2579. ppMode.chromMode &= ~filters[i].mask;
  2580. filterNameOk=1;
  2581. if(!enable) break; // user wants to disable it
  2582. if(q >= filters[i].minLumQuality)
  2583. ppMode.lumMode|= filters[i].mask;
  2584. if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
  2585. if(q >= filters[i].minChromQuality)
  2586. ppMode.chromMode|= filters[i].mask;
  2587. if(filters[i].mask == LEVEL_FIX)
  2588. {
  2589. int o;
  2590. ppMode.minAllowedY= 16;
  2591. ppMode.maxAllowedY= 234;
  2592. for(o=0; options[o]!=NULL; o++)
  2593. if( !strcmp(options[o],"fullyrange")
  2594. ||!strcmp(options[o],"f"))
  2595. {
  2596. ppMode.minAllowedY= 0;
  2597. ppMode.maxAllowedY= 255;
  2598. numOfUnknownOptions--;
  2599. }
  2600. }
  2601. else if(filters[i].mask == TEMP_NOISE_FILTER)
  2602. {
  2603. int o;
  2604. int numOfNoises=0;
  2605. ppMode.maxTmpNoise[0]= 150;
  2606. ppMode.maxTmpNoise[1]= 200;
  2607. ppMode.maxTmpNoise[2]= 400;
  2608. for(o=0; options[o]!=NULL; o++)
  2609. {
  2610. char *tail;
  2611. ppMode.maxTmpNoise[numOfNoises]=
  2612. strtol(options[o], &tail, 0);
  2613. if(tail!=options[o])
  2614. {
  2615. numOfNoises++;
  2616. numOfUnknownOptions--;
  2617. if(numOfNoises >= 3) break;
  2618. }
  2619. }
  2620. }
  2621. }
  2622. }
  2623. if(!filterNameOk) ppMode.error++;
  2624. ppMode.error += numOfUnknownOptions;
  2625. }
  2626. #ifdef HAVE_ODIVX_POSTPROCESS
  2627. if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
  2628. if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
  2629. if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
  2630. if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
  2631. if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
  2632. if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
  2633. #endif
  2634. return ppMode;
  2635. }
  2636. /**
  2637. * Obsolete, dont use it, use postprocess2() instead
  2638. */
  2639. void postprocess(unsigned char * src[], int src_stride,
  2640. unsigned char * dst[], int dst_stride,
  2641. int horizontal_size, int vertical_size,
  2642. QP_STORE_T *QP_store, int QP_stride,
  2643. int mode)
  2644. {
  2645. struct PPMode ppMode;
  2646. static QP_STORE_T zeroArray[2048/8];
  2647. /*
  2648. static int qual=0;
  2649. ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock,tmpnoise:150:200:300", qual);
  2650. printf("OK\n");
  2651. qual++;
  2652. qual%=7;
  2653. printf("\n%X %X %X %X :%d: %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error,
  2654. qual, ppMode.maxTmpNoise[0], ppMode.maxTmpNoise[1], ppMode.maxTmpNoise[2]);
  2655. postprocess2(src, src_stride, dst, dst_stride,
  2656. horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
  2657. return;
  2658. */
  2659. if(QP_store==NULL)
  2660. {
  2661. QP_store= zeroArray;
  2662. QP_stride= 0;
  2663. }
  2664. ppMode.lumMode= mode;
  2665. mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
  2666. ppMode.chromMode= mode;
  2667. ppMode.maxTmpNoise[0]= 700;
  2668. ppMode.maxTmpNoise[1]= 1500;
  2669. ppMode.maxTmpNoise[2]= 3000;
  2670. #ifdef HAVE_ODIVX_POSTPROCESS
  2671. // Note: I could make this shit outside of this file, but it would mean one
  2672. // more function call...
  2673. if(use_old_pp){
  2674. odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
  2675. return;
  2676. }
  2677. #endif
  2678. postProcess(src[0], src_stride, dst[0], dst_stride,
  2679. horizontal_size, vertical_size, QP_store, QP_stride, 0, &ppMode);
  2680. horizontal_size >>= 1;
  2681. vertical_size >>= 1;
  2682. src_stride >>= 1;
  2683. dst_stride >>= 1;
  2684. if(1)
  2685. {
  2686. postProcess(src[1], src_stride, dst[1], dst_stride,
  2687. horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode);
  2688. postProcess(src[2], src_stride, dst[2], dst_stride,
  2689. horizontal_size, vertical_size, QP_store, QP_stride, 2, &ppMode);
  2690. }
  2691. else
  2692. {
  2693. memset(dst[1], 128, dst_stride*vertical_size);
  2694. memset(dst[2], 128, dst_stride*vertical_size);
  2695. // memcpy(dst[1], src[1], src_stride*horizontal_size);
  2696. // memcpy(dst[2], src[2], src_stride*horizontal_size);
  2697. }
  2698. }
  2699. void postprocess2(unsigned char * src[], int src_stride,
  2700. unsigned char * dst[], int dst_stride,
  2701. int horizontal_size, int vertical_size,
  2702. QP_STORE_T *QP_store, int QP_stride,
  2703. struct PPMode *mode)
  2704. {
  2705. static QP_STORE_T zeroArray[2048/8];
  2706. if(QP_store==NULL)
  2707. {
  2708. QP_store= zeroArray;
  2709. QP_stride= 0;
  2710. }
  2711. #ifdef HAVE_ODIVX_POSTPROCESS
  2712. // Note: I could make this shit outside of this file, but it would mean one
  2713. // more function call...
  2714. if(use_old_pp){
  2715. odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
  2716. mode->oldMode);
  2717. return;
  2718. }
  2719. #endif
  2720. postProcess(src[0], src_stride, dst[0], dst_stride,
  2721. horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
  2722. horizontal_size >>= 1;
  2723. vertical_size >>= 1;
  2724. src_stride >>= 1;
  2725. dst_stride >>= 1;
  2726. postProcess(src[1], src_stride, dst[1], dst_stride,
  2727. horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
  2728. postProcess(src[2], src_stride, dst[2], dst_stride,
  2729. horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
  2730. }
  2731. /**
  2732. * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
  2733. * 0 <= quality <= 6
  2734. */
  2735. int getPpModeForQuality(int quality){
  2736. int modes[1+GET_PP_QUALITY_MAX]= {
  2737. 0,
  2738. #if 1
  2739. // horizontal filters first
  2740. LUM_H_DEBLOCK,
  2741. LUM_H_DEBLOCK | LUM_V_DEBLOCK,
  2742. LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
  2743. LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
  2744. LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
  2745. LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
  2746. #else
  2747. // vertical filters first
  2748. LUM_V_DEBLOCK,
  2749. LUM_V_DEBLOCK | LUM_H_DEBLOCK,
  2750. LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
  2751. LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
  2752. LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
  2753. LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
  2754. #endif
  2755. };
  2756. #ifdef HAVE_ODIVX_POSTPROCESS
  2757. int odivx_modes[1+GET_PP_QUALITY_MAX]= {
  2758. 0,
  2759. PP_DEBLOCK_Y_H,
  2760. PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
  2761. PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
  2762. PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
  2763. PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
  2764. PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
  2765. };
  2766. if(use_old_pp) return odivx_modes[quality];
  2767. #endif
  2768. return modes[quality];
  2769. }
/**
 * Copies a block from src to dst and fixes the blacklevel
 * numLines must be a multiple of 4
 * levelFix == 0 -> dont touch the brighness & contrast
 *
 * Copies 8 lines of 8 bytes. With levelFix the MMX path applies the
 * fixed-point brightness/contrast correction using the global
 * packedYOffset/packedYScale values (set up in postProcess()):
 * roughly dst = ((src - offset) << 6) * scale >> 16, with unsigned
 * saturation on repack. The non-MMX fallback only copies (no level fix
 * is implemented there).
 */
static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
	int levelFix)
{
#ifndef HAVE_MMX
	int i;
#endif
	if(levelFix)
	{
#ifdef HAVE_MMX
	asm volatile(
		"leal (%2,%2), %%eax	\n\t"	// eax = 2*srcStride
		"leal (%3,%3), %%ebx	\n\t"	// ebx = 2*dstStride
		"movq packedYOffset, %%mm2	\n\t"
		"movq packedYScale, %%mm3	\n\t"
		"pxor %%mm4, %%mm4	\n\t"	// mm4 = 0, for byte->word unpacking

// processes 2 source lines (level-corrects and stores them) and
// advances the src pointer by 2 lines; dst is advanced between uses
#define SCALED_CPY					\
		"movq (%0), %%mm0	\n\t"\
		"movq (%0), %%mm5	\n\t"\
		"punpcklbw %%mm4, %%mm0 \n\t"\
		"punpckhbw %%mm4, %%mm5 \n\t"\
		"psubw %%mm2, %%mm0	\n\t"\
		"psubw %%mm2, %%mm5	\n\t"\
		"movq (%0,%2), %%mm1	\n\t"\
		"psllw $6, %%mm0	\n\t"\
		"psllw $6, %%mm5	\n\t"\
		"pmulhw %%mm3, %%mm0	\n\t"\
		"movq (%0,%2), %%mm6	\n\t"\
		"pmulhw %%mm3, %%mm5	\n\t"\
		"punpcklbw %%mm4, %%mm1 \n\t"\
		"punpckhbw %%mm4, %%mm6 \n\t"\
		"psubw %%mm2, %%mm1	\n\t"\
		"psubw %%mm2, %%mm6	\n\t"\
		"psllw $6, %%mm1	\n\t"\
		"psllw $6, %%mm6	\n\t"\
		"pmulhw %%mm3, %%mm1	\n\t"\
		"pmulhw %%mm3, %%mm6	\n\t"\
		"addl %%eax, %0	\n\t"\
		"packuswb %%mm5, %%mm0	\n\t"\
		"packuswb %%mm6, %%mm1	\n\t"\
		"movq %%mm0, (%1)	\n\t"\
		"movq %%mm1, (%1, %3)	\n\t"

// 4 uses of the 2-line macro -> 8 lines copied
SCALED_CPY
		"addl %%ebx, %1	\n\t"
SCALED_CPY
		"addl %%ebx, %1	\n\t"
SCALED_CPY
		"addl %%ebx, %1	\n\t"
SCALED_CPY

	: "+r"(src),
	"+r"(dst)
	:"r" (srcStride),
	"r" (dstStride)
	: "%eax", "%ebx"
	);
#else
	for(i=0; i<8; i++)
		memcpy(	&(dst[dstStride*i]),
			&(src[srcStride*i]), BLOCK_SIZE);
#endif
	}
	else
	{
#ifdef HAVE_MMX
	// plain copy; src/dst are preserved via push/pop instead of "+r"
	asm volatile(
		"pushl %0 \n\t"
		"pushl %1 \n\t"
		"leal (%2,%2), %%eax	\n\t"	// eax = 2*srcStride
		"leal (%3,%3), %%ebx	\n\t"	// ebx = 2*dstStride

// copies 2 lines of 8 bytes
#define SIMPLE_CPY					\
		"movq (%0), %%mm0	\n\t"\
		"movq (%0,%2), %%mm1	\n\t"\
		"movq %%mm0, (%1)	\n\t"\
		"movq %%mm1, (%1, %3)	\n\t"

// 4 uses of the 2-line macro -> 8 lines copied
SIMPLE_CPY
		"addl %%eax, %0	\n\t"
		"addl %%ebx, %1	\n\t"
SIMPLE_CPY
		"addl %%eax, %0	\n\t"
		"addl %%ebx, %1	\n\t"
SIMPLE_CPY
		"addl %%eax, %0	\n\t"
		"addl %%ebx, %1	\n\t"
SIMPLE_CPY
		"popl %1 \n\t"
		"popl %0 \n\t"
	: : "r" (src),
	"r" (dst),
	"r" (srcStride),
	"r" (dstStride)
	: "%eax", "%ebx"
	);
#else
	for(i=0; i<8; i++)
		memcpy(	&(dst[dstStride*i]),
			&(src[srcStride*i]), BLOCK_SIZE);
#endif
	}
}
/**
 * Filters array of bytes (Y or U or V values)
 *
 * Main driver: walks the plane in 8x8 blocks, copying src->dst block by
 * block and applying the filters selected in ppMode (deinterlacing,
 * vertical/horizontal deblocking, deringing, temporal noise reduction).
 * For the luma plane it also maintains a brightness histogram used for
 * the automatic level fix.
 *
 * NOTE(review): uses several function-local static buffers and (in the MMX
 * paths) global state (packedYOffset/packedYScale/pQPb/tempBlocks), so this
 * is not reentrant/thread-safe.
 */
static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
	QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode)
{
	int x,y;
	const int mode= isColor ? ppMode->chromMode : ppMode->lumMode;
	/* we need 64bit here otherwise we'll going to have a problem
	   after watching a black picture for 5 hours*/
	static uint64_t *yHistogram= NULL;
	int black=0, white=255; // blackest black and whitest white in the picture
	int QPCorrecture= 256;	// 8.8 fixed point QP scale factor for the level fix

	/* Temporary buffers for handling the last row(s) */
	static uint8_t *tempDst= NULL;
	static uint8_t *tempSrc= NULL;

	/* Temporary buffers for handling the last block */
	static uint8_t *tempDstBlock= NULL;
	static uint8_t *tempSrcBlock= NULL;

	/* Temporal noise reducing buffers, one set per plane (Y/U/V) */
	static uint8_t *tempBlured[3]= {NULL,NULL,NULL};
	static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL};

	int copyAhead;
#ifdef PP_FUNNY_STRIDE
	uint8_t *dstBlockPtrBackup;
	uint8_t *srcBlockPtrBackup;
#endif

#ifdef MORE_TIMING
	long long T0, T1, diffTime=0;
#endif
#ifdef TIMING
	long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
	sumTime= rdtsc();
#endif
//mode= 0x7F;

#ifdef HAVE_MMX
	/* mirror the per-call thresholds into the globals the MMX code reads */
	maxTmpNoise[0]= ppMode->maxTmpNoise[0];
	maxTmpNoise[1]= ppMode->maxTmpNoise[1];
	maxTmpNoise[2]= ppMode->maxTmpNoise[2];
#endif

	/* how far below the current block row the most demanding enabled filter
	   reads/writes; determines how many extra lines must be copied ahead */
	if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
	else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14;
	else if(   (mode & V_DEBLOCK)
		|| (mode & LINEAR_IPOL_DEINT_FILTER)
		|| (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
	else if(mode & V_X1_FILTER) copyAhead=11;
	else if(mode & V_RK1_FILTER) copyAhead=10;
	else if(mode & DERING) copyAhead=9;
	else copyAhead=8;

	copyAhead-= 8;	// extra lines beyond the 8-line block itself

	/* lazy one-time allocation of the edge-handling buffers (never freed) */
	if(tempDst==NULL)
	{
		tempDst= (uint8_t*)memalign(8, 1024*24);
		tempSrc= (uint8_t*)memalign(8, 1024*24);
		tempDstBlock= (uint8_t*)memalign(8, 1024*24);
		tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
	}

	if(tempBlured[isColor]==NULL && (mode & TEMP_NOISE_FILTER))
	{
//		printf("%d %d %d\n", isColor, dstStride, height);
		//FIXME works only as long as the size doesnt increase
		//Note:the +17*1024 is just there so i dont have to worry about r/w over te end
		tempBlured[isColor]= (uint8_t*)memalign(8, dstStride*((height+7)&(~7)) + 17*1024);
		tempBluredPast[isColor]= (uint32_t*)memalign(8, 256*((height+7)&(~7))/2 + 17*1024);

		memset(tempBlured[isColor], 0, dstStride*((height+7)&(~7)) + 17*1024);
		memset(tempBluredPast[isColor], 0, 256*((height+7)&(~7))/2 + 17*1024);
	}

	if(!yHistogram)
	{
		int i;
		yHistogram= (uint64_t*)malloc(8*256);
		/* seed with a flat histogram so the first frames behave sanely */
		for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;

		if(mode & FULL_Y_RANGE)
		{
			maxAllowedY=255;
			minAllowedY=0;
		}
	}

	if(!isColor)
	{
		/* derive black/white clip points from the luma histogram and pack
		   the resulting offset/scale into the MMX-friendly 4x16bit globals */
		uint64_t sum= 0;
		int i;
		static int framenum= -1;
		uint64_t maxClipped;
		uint64_t clipped;
		double scale;

		framenum++;
		if(framenum == 1) yHistogram[0]= width*height/64*15/256;

		for(i=0; i<256; i++)
		{
			sum+= yHistogram[i];
//			printf("%d ", yHistogram[i]);
		}
//		printf("\n\n");

		/* we allways get a completly black picture first */
		maxClipped= (uint64_t)(sum * maxClippedThreshold);

		/* darkest value such that at most maxClipped pixels are clipped */
		clipped= sum;
		for(black=255; black>0; black--)
		{
			if(clipped < maxClipped) break;
			clipped-= yHistogram[black];
		}

		/* brightest value, same criterion from the other end */
		clipped= sum;
		for(white=0; white<256; white++)
		{
			if(clipped < maxClipped) break;
			clipped-= yHistogram[white];
		}

		/* replicate the 16bit offset into all 4 words of the 64bit global */
		packedYOffset= (black - minAllowedY) & 0xFFFF;
		packedYOffset|= packedYOffset<<32;
		packedYOffset|= packedYOffset<<16;

		scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);

		/* scale in 6.10 fixed point, replicated into 4 words */
		packedYScale= (uint16_t)(scale*1024.0 + 0.5);
		packedYScale|= packedYScale<<32;
		packedYScale|= packedYScale<<16;
	}
	else
	{
		/* chroma: identity scale, no offset */
		packedYScale= 0x0100010001000100LL;
		packedYOffset= 0;
	}

	if(mode & LEVEL_FIX)	QPCorrecture= packedYScale &0xFFFF;
	else			QPCorrecture= 256;

	/* copy & deinterlace first row of blocks (written into tempDst because
	   the filters may touch the line above, which doesn't exist at y==-8) */
	y=-BLOCK_SIZE;
	{
		//1% speedup if these are here instead of the inner loop
		uint8_t *srcBlock= &(src[y*srcStride]);
		uint8_t *dstBlock= &(dst[y*dstStride]);

		dstBlock= tempDst + dstStride;

		// From this point on it is guranteed that we can read and write 16 lines downward
		// finish 1 block before the next otherwise we'll might have a problem
		// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
		for(x=0; x<width; x+=BLOCK_SIZE)
		{
#ifdef HAVE_MMX2
/*
			prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
			prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
			prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
			prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
*/
			/* prefetch 2 lines a few rows ahead of the current block */
			asm(
				"movl %4, %%eax			\n\t"
				"shrl $2, %%eax			\n\t"
				"andl $6, %%eax			\n\t"
				"addl %5, %%eax			\n\t"
				"movl %%eax, %%ebx		\n\t"
				"imul %1, %%eax			\n\t"
				"imul %3, %%ebx			\n\t"
				"prefetchnta 32(%%eax, %0)	\n\t"
				"prefetcht0 32(%%ebx, %2)	\n\t"
				"addl %1, %%eax			\n\t"
				"addl %3, %%ebx			\n\t"
				"prefetchnta 32(%%eax, %0)	\n\t"
				"prefetcht0 32(%%ebx, %2)	\n\t"
			:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
			"m" (x), "m" (copyAhead)
			: "%eax", "%ebx"
			);
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
/*			prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
			prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
			prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
			prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
*/
#endif

			blockCopy(dstBlock + dstStride*copyAhead, dstStride,
				srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);

			if(mode & LINEAR_IPOL_DEINT_FILTER)
				deInterlaceInterpolateLinear(dstBlock, dstStride);
			else if(mode & LINEAR_BLEND_DEINT_FILTER)
				deInterlaceBlendLinear(dstBlock, dstStride);
			else if(mode & MEDIAN_DEINT_FILTER)
				deInterlaceMedian(dstBlock, dstStride);
			else if(mode & CUBIC_IPOL_DEINT_FILTER)
				deInterlaceInterpolateCubic(dstBlock, dstStride);
/*			else if(mode & CUBIC_BLEND_DEINT_FILTER)
				deInterlaceBlendCubic(dstBlock, dstStride);
*/
			dstBlock+=8;
			srcBlock+=8;
		}
		/* flush the finished lines of this warm-up row back into dst */
		memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride );
	}

	for(y=0; y<height; y+=BLOCK_SIZE)
	{
		//1% speedup if these are here instead of the inner loop
		uint8_t *srcBlock= &(src[y*srcStride]);
		uint8_t *dstBlock= &(dst[y*dstStride]);
#ifdef ARCH_X86
		/* QP lookup as a fixed-point stepper: QPFrac overflows every 2
		   (chroma) or 4 (luma) blocks, advancing QPptr via the carry */
		int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
		int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
		int QPFrac= QPDelta;
		uint8_t *tempBlock1= tempBlocks;
		uint8_t *tempBlock2= tempBlocks + 8;
#endif
		int QP=0;
		/* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
		   if not than use a temporary buffer */
		if(y+15 >= height)
		{
			int i;
			/* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
			   blockcopy to dst later */
			memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
				srcStride*MAX(height-y-copyAhead, 0) );

			/* duplicate last line of src to fill the void upto line (copyAhead+7) */
			for(i=MAX(height-y, 8); i<copyAhead+8; i++)
				memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);

			/* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
			memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );

			/* duplicate last line of dst to fill the void upto line (copyAhead) */
			for(i=height-y+1; i<=copyAhead; i++)
				memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);

			dstBlock= tempDst + dstStride;
			srcBlock= tempSrc;
		}

		// From this point on it is guranteed that we can read and write 16 lines downward
		// finish 1 block before the next otherwise we'll might have a problem
		// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
		for(x=0; x<width; x+=BLOCK_SIZE)
		{
			const int stride= dstStride;
			uint8_t *tmpXchg;
#ifdef ARCH_X86
			QP= *QPptr;
			/* QPFrac += QPDelta; on overflow the carry (via sbbl) adds
			   sizeof(int) to QPptr, stepping to the next QP value */
			asm volatile(
				"addl %2, %1				\n\t"
				"sbbl %%eax, %%eax			\n\t"
				"shll $2, %%eax				\n\t"
				"subl %%eax, %0				\n\t"
				: "+r" (QPptr), "+m" (QPFrac)
				: "r" (QPDelta)
				: "%eax"
			);
#else
			QP= isColor ?
				QPs[(y>>3)*QPStride + (x>>3)]:
				QPs[(y>>4)*QPStride + (x>>4)];
#endif
			if(!isColor)
			{
				/* rescale QP by the level-fix gain and sample one pixel
				   per block into the brightness histogram */
				QP= (QP* QPCorrecture)>>8;
				yHistogram[ srcBlock[srcStride*12 + 4] ]++;
			}
#ifdef HAVE_MMX
			/* broadcast QP into all 8 bytes of the global pQPb */
			asm volatile(
				"movd %0, %%mm7					\n\t"
				"packuswb %%mm7, %%mm7				\n\t" // 0, 0, 0, QP, 0, 0, 0, QP
				"packuswb %%mm7, %%mm7				\n\t" // 0,QP, 0, QP, 0,QP, 0, QP
				"packuswb %%mm7, %%mm7				\n\t" // QP,..., QP
				"movq %%mm7, pQPb				\n\t"
				: : "r" (QP)
			);
#endif
#ifdef MORE_TIMING
			T0= rdtsc();
#endif
#ifdef HAVE_MMX2
/*
			prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
			prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
			prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
			prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
*/
			/* same look-ahead prefetch as in the warm-up row above */
			asm(
				"movl %4, %%eax			\n\t"
				"shrl $2, %%eax			\n\t"
				"andl $6, %%eax			\n\t"
				"addl %5, %%eax			\n\t"
				"movl %%eax, %%ebx		\n\t"
				"imul %1, %%eax			\n\t"
				"imul %3, %%ebx			\n\t"
				"prefetchnta 32(%%eax, %0)	\n\t"
				"prefetcht0 32(%%ebx, %2)	\n\t"
				"addl %1, %%eax			\n\t"
				"addl %3, %%ebx			\n\t"
				"prefetchnta 32(%%eax, %0)	\n\t"
				"prefetcht0 32(%%ebx, %2)	\n\t"
			:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
			"m" (x), "m" (copyAhead)
			: "%eax", "%ebx"
			);
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
/*			prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
			prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
			prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
			prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
*/
#endif

#ifdef PP_FUNNY_STRIDE
			//can we mess with a 8x16 block, if not use a temp buffer, yes again
			if(x+7 >= width)
			{
				int i;
				dstBlockPtrBackup= dstBlock;
				srcBlockPtrBackup= srcBlock;

				for(i=0;i<BLOCK_SIZE*2; i++)
				{
					memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
					memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
				}

				dstBlock= tempDstBlock;
				srcBlock= tempSrcBlock;
			}
#endif

			blockCopy(dstBlock + dstStride*copyAhead, dstStride,
				srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);

			if(mode & LINEAR_IPOL_DEINT_FILTER)
				deInterlaceInterpolateLinear(dstBlock, dstStride);
			else if(mode & LINEAR_BLEND_DEINT_FILTER)
				deInterlaceBlendLinear(dstBlock, dstStride);
			else if(mode & MEDIAN_DEINT_FILTER)
				deInterlaceMedian(dstBlock, dstStride);
			else if(mode & CUBIC_IPOL_DEINT_FILTER)
				deInterlaceInterpolateCubic(dstBlock, dstStride);
/*			else if(mode & CUBIC_BLEND_DEINT_FILTER)
				deInterlaceBlendCubic(dstBlock, dstStride);
*/

			/* only deblock if we have 2 blocks */
			if(y + 8 < height)
			{
#ifdef MORE_TIMING
				T1= rdtsc();
				memcpyTime+= T1-T0;
				T0=T1;
#endif
				/* vertical deblocking of the edge between this block row
				   and the next one */
				if(mode & V_RK1_FILTER)
					vertRK1Filter(dstBlock, stride, QP);
				else if(mode & V_X1_FILTER)
					vertX1Filter(dstBlock, stride, QP);
				else if(mode & V_DEBLOCK)
				{
					if( isVertDC(dstBlock, stride))
					{
						if(isVertMinMaxOk(dstBlock, stride, QP))
							doVertLowPass(dstBlock, stride, QP);
					}
					else
						doVertDefFilter(dstBlock, stride, QP);
				}
#ifdef MORE_TIMING
				T1= rdtsc();
				vertTime+= T1-T0;
				T0=T1;
#endif
			}

#ifdef HAVE_MMX
			/* transpose the block so horizontal filtering can reuse the
			   vertical MMX filters */
			transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
#endif
			/* check if we have a previous block to deblock it with dstBlock */
			if(x - 8 >= 0)
			{
#ifdef MORE_TIMING
				T0= rdtsc();
#endif
#ifdef HAVE_MMX
				/* horizontal deblocking on the transposed block, then
				   transpose the filtered columns back */
				if(mode & H_RK1_FILTER)
					vertRK1Filter(tempBlock1, 16, QP);
				else if(mode & H_X1_FILTER)
					vertX1Filter(tempBlock1, 16, QP);
				else if(mode & H_DEBLOCK)
				{
					if( isVertDC(tempBlock1, 16) )
					{
						if(isVertMinMaxOk(tempBlock1, 16, QP))
							doVertLowPass(tempBlock1, 16, QP);
					}
					else
						doVertDefFilter(tempBlock1, 16, QP);
				}

				transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16);

#else
				if(mode & H_X1_FILTER)
					horizX1Filter(dstBlock-4, stride, QP);
				else if(mode & H_DEBLOCK)
				{
					if( isHorizDC(dstBlock-4, stride))
					{
						if(isHorizMinMaxOk(dstBlock-4, stride, QP))
							doHorizLowPass(dstBlock-4, stride, QP);
					}
					else
						doHorizDefFilter(dstBlock-4, stride, QP);
				}
#endif
#ifdef MORE_TIMING
				T1= rdtsc();
				horizTime+= T1-T0;
				T0=T1;
#endif
				/* dering/noise filters run one block behind, on fully
				   deblocked data */
				if(mode & DERING)
				{
				//FIXME filter first line
					if(y>0) dering(dstBlock - stride - 8, stride, QP);
				}

				if(mode & TEMP_NOISE_FILTER)
				{
					tempNoiseReducer(dstBlock-8, stride,
						tempBlured[isColor] + y*dstStride + x,
						tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
						ppMode->maxTmpNoise);
				}
			}

#ifdef PP_FUNNY_STRIDE
			/* did we use a tmp-block buffer */
			if(x+7 >= width)
			{
				int i;
				dstBlock= dstBlockPtrBackup;
				srcBlock= srcBlockPtrBackup;

				for(i=0;i<BLOCK_SIZE*2; i++)
				{
					memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
				}
			}
#endif

			dstBlock+=8;
			srcBlock+=8;

#ifdef HAVE_MMX
			/* double-buffer the transposed blocks: the current one becomes
			   the "previous" one for the next iteration */
			tmpXchg= tempBlock1;
			tempBlock1= tempBlock2;
			tempBlock2 = tmpXchg;
#endif
		}

		/* handle the rightmost block of the row, which the one-block-behind
		   dering/noise filters above never reached */
		if(mode & DERING)
		{
			if(y > 0) dering(dstBlock - dstStride - 8, dstStride, QP);
		}

		if((mode & TEMP_NOISE_FILTER))
		{
			tempNoiseReducer(dstBlock-8, dstStride,
				tempBlured[isColor] + y*dstStride + x,
				tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
				ppMode->maxTmpNoise);
		}

		/* did we use a tmp buffer for the last lines*/
		if(y+15 >= height)
		{
			uint8_t *dstBlock= &(dst[y*dstStride]);
			memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
		}
/*
		for(x=0; x<width; x+=32)
		{
			volatile int i;
			i+=	+ dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
				+ dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
				+ dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
//			+ dstBlock[x +13*dstStride]
//			+ dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
		}*/
	}
#ifdef HAVE_3DNOW
	asm volatile("femms");
#elif defined (HAVE_MMX)
	asm volatile("emms");
#endif

#ifdef TIMING
	// FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
	sumTime= rdtsc() - sumTime;
	if(!isColor)
		printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
			(int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
			(int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
			, black, white);
#endif
#ifdef DEBUG_BRIGHTNESS
	/* draw the luma histogram and the black/white clip levels into the frame */
	if(!isColor)
	{
		int max=1;
		int i;
		for(i=0; i<256; i++)
			if(yHistogram[i] > max) max=yHistogram[i];

		for(i=1; i<256; i++)
		{
			int x;
			int start=yHistogram[i-1]/(max/256+1);
			int end=yHistogram[i]/(max/256+1);
			int inc= end > start ? 1 : -1;
			for(x=start; x!=end+inc; x+=inc)
				dst[ i*dstStride + x]+=128;
		}

		for(i=0; i<100; i+=2)
		{
			dst[ (white)*dstStride + i]+=128;
			dst[ (black)*dstStride + i]+=128;
		}

	}
#endif

}