/*
    Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
			C	MMX	MMX2	3DNow
isVertDC		Ec	Ec
isVertMinMaxOk		Ec	Ec
doVertLowPass		E		e	e
doVertDefFilter		Ec	Ec	Ec
isHorizDC		Ec	Ec
isHorizMinMaxOk			a
doHorizLowPass		E		a	a
doHorizDefFilter	E	ac	ac
deRing
Vertical RKAlgo1	E		a	a
Vertical X1		a		E	E
Horizontal X1		a		E	E
LinIpolDeinterlace	e		E	E*
CubicIpolDeinterlace	a		e	e*
LinBlendDeinterlace	e		E	E*
MedianDeinterlace		Ec	Ec

* I don't have a 3DNow! CPU -> it's untested
E = Exact implementation
e = almost exact implementation (slightly different rounding, ...)
a = alternative / approximate implementation
c = checked against the other implementations (-vo md5)
*/
/*
TODO:
verify that everything works as it should (how?)
reduce the time wasted on the mem transfer
implement dering
implement everything in C at least (done at the moment but ...)
unroll stuff if instructions depend too much on the prior one
we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
move YScale thing to the end instead of fixing QP
write a faster and higher quality deblocking filter :)
do something about the speed of the horizontal filters
make the mainloop more flexible (variable number of blocks at once
(the if/else stuff per block is slowing things down)
compare the quality & speed of all filters
split this huge file
fix warnings (unused vars, ...)
noise reduction filters
...

Notes:
*/
//Changelog: use the CVS log

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h> // for memalign() (used by the horizX1Filter LUT)
#include "../config.h"
//#undef HAVE_MMX2
//#define HAVE_3DNOW
//#undef HAVE_MMX
#include "postprocess.h"

#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define SIGN(a) ((a) > 0 ? 1 : -1)

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
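
/* Both PAVGB variants compute the rounded unsigned byte average (a+b+1)>>1;
   the filters below chain such averages to build multi-tap filter weights,
   e.g. avg(avg(p0,p2), p1) = (p0 + 2*p1 + p2 + r)/4 with rounding r. */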
static uint64_t packedYOffset=	0x0000000000000000LL;
static uint64_t packedYScale=	0x0100010001000100LL;
static uint64_t w05=		0x0005000500050005LL;
static uint64_t w20=		0x0020002000200020LL;
static uint64_t w1400=		0x1400140014001400LL;
static uint64_t bm00000001=	0x00000000000000FFLL;
static uint64_t bm00010000=	0x000000FF00000000LL;
static uint64_t bm00001000=	0x00000000FF000000LL;
static uint64_t bm10000000=	0xFF00000000000000LL;
static uint64_t bm10000001=	0xFF000000000000FFLL;
static uint64_t bm11000011=	0xFFFF00000000FFFFLL;
static uint64_t bm00000011=	0x000000000000FFFFLL;
static uint64_t bm11111110=	0xFFFFFFFFFFFFFF00LL;
static uint64_t bm11000000=	0xFFFF000000000000LL;
static uint64_t bm00011000=	0x000000FFFF000000LL;
static uint64_t bm00110011=	0x0000FFFF0000FFFFLL;
static uint64_t bm11001100=	0xFFFF0000FFFF0000LL;
static uint64_t b00=		0x0000000000000000LL;
static uint64_t b01=		0x0101010101010101LL;
static uint64_t b02=		0x0202020202020202LL;
static uint64_t b0F=		0x0F0F0F0F0F0F0F0FLL;
static uint64_t bFF=		0xFFFFFFFFFFFFFFFFLL;
static uint64_t b20=		0x2020202020202020LL;
static uint64_t b80=		0x8080808080808080LL;
static uint64_t b7E=		0x7E7E7E7E7E7E7E7ELL;
static uint64_t b7C=		0x7C7C7C7C7C7C7C7CLL;
static uint64_t b3F=		0x3F3F3F3F3F3F3F3FLL;

static uint64_t temp0=0;
static uint64_t temp1=0;
static uint64_t temp2=0;
static uint64_t temp3=0;
static uint64_t temp4=0;
static uint64_t temp5=0;
static uint64_t pQPb=0;

static uint8_t tempBlock[16*16]; //used so the horizontal code gets aligned data
int hFlatnessThreshold= 56 - 16;
int vFlatnessThreshold= 56 - 16;
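/* 56 is the maximum number of equal-neighbour pairs per 8x8 block
   (8 rows/columns x 7 neighbour pairs), so these thresholds require at
   least 40 of the 56 pairs to be "equal" (differ by at most 1). */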
//amount of "black" you are willing to lose to get a brightness-corrected picture
double maxClippedThreshold= 0.01;

int maxAllowedY=255;
//FIXME can never make a movie's black brighter (anyone needs that?)
int minAllowedY=16;
#ifdef TIMING
static inline long long rdtsc()
{
	long long l;
	asm volatile( "rdtsc\n\t"
		: "=A" (l)
	);
//	printf("%d\n", int(l/1000));
	return l;
}
#endif
#ifdef HAVE_MMX2
static inline void prefetchnta(void *p)
{
	asm volatile( "prefetchnta (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht0(void *p)
{
	asm volatile( "prefetcht0 (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht1(void *p)
{
	asm volatile( "prefetcht1 (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht2(void *p)
{
	asm volatile( "prefetcht2 (%0)\n\t"
		: : "r" (p)
	);
}
#endif
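
/* The prefetch wrappers above differ only in the cache hint: prefetchnta
   fetches non-temporally to minimize cache pollution, while t0/t1/t2 fetch
   into progressively more distant levels of the cache hierarchy. */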
//FIXME? |255-0| = 1 (shouldn't be a problem ...)
/**
 * Check if the middle 8x8 Block in the given 8x16 block is flat
 */
static inline int isVertDC(uint8_t src[], int stride){
	int numEq= 0;
	int y;
	src+= stride*4; // src points to begin of the 8x8 Block
#ifdef HAVE_MMX
	asm volatile(
		"pushl %1\n\t"
		"movq b7E, %%mm7 \n\t" // mm7 = 0x7E
		"movq b7C, %%mm6 \n\t" // mm6 = 0x7C
		"movq (%1), %%mm0 \n\t"
		"addl %2, %1 \n\t"
		"movq (%1), %%mm1 \n\t"
		"psubb %%mm1, %%mm0 \n\t" // mm0 = difference
		"paddb %%mm7, %%mm0 \n\t"
		"pcmpgtb %%mm6, %%mm0 \n\t"

		"addl %2, %1 \n\t"
		"movq (%1), %%mm2 \n\t"
		"psubb %%mm2, %%mm1 \n\t"
		"paddb %%mm7, %%mm1 \n\t"
		"pcmpgtb %%mm6, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t"

		"addl %2, %1 \n\t"
		"movq (%1), %%mm1 \n\t"
		"psubb %%mm1, %%mm2 \n\t"
		"paddb %%mm7, %%mm2 \n\t"
		"pcmpgtb %%mm6, %%mm2 \n\t"
		"paddb %%mm2, %%mm0 \n\t"

		"addl %2, %1 \n\t"
		"movq (%1), %%mm2 \n\t"
		"psubb %%mm2, %%mm1 \n\t"
		"paddb %%mm7, %%mm1 \n\t"
		"pcmpgtb %%mm6, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t"

		"addl %2, %1 \n\t"
		"movq (%1), %%mm1 \n\t"
		"psubb %%mm1, %%mm2 \n\t"
		"paddb %%mm7, %%mm2 \n\t"
		"pcmpgtb %%mm6, %%mm2 \n\t"
		"paddb %%mm2, %%mm0 \n\t"

		"addl %2, %1 \n\t"
		"movq (%1), %%mm2 \n\t"
		"psubb %%mm2, %%mm1 \n\t"
		"paddb %%mm7, %%mm1 \n\t"
		"pcmpgtb %%mm6, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t"

		"addl %2, %1 \n\t"
		"movq (%1), %%mm1 \n\t"
		"psubb %%mm1, %%mm2 \n\t"
		"paddb %%mm7, %%mm2 \n\t"
		"pcmpgtb %%mm6, %%mm2 \n\t"
		"paddb %%mm2, %%mm0 \n\t"
		" \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlw $8, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlq $16, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlq $32, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"popl %1\n\t"
		"movd %%mm0, %0 \n\t"
		: "=r" (numEq)
		: "r" (src), "r" (stride)
		);
//	printf("%d\n", numEq);
	numEq= (256 - (numEq & 0xFF)) &0xFF;

//	int asmEq= numEq;
//	numEq=0;
//	uint8_t *temp= src;
#else
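	/* (x - y + 1) & 0xFFFF < 3 is an unsigned-wraparound test for
	   |x - y| <= 1: it accepts exactly the differences -1, 0 and +1. */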
	for(y=0; y<BLOCK_SIZE-1; y++)
	{
		if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
		src+= stride;
	}
#endif
/*	if(abs(numEq - asmEq) > 0)
	{
		printf("\nasm:%d  c:%d\n", asmEq, numEq);
		for(int y=0; y<8; y++)
		{
			for(int x=0; x<8; x++)
			{
				printf("%d ", temp[x + y*stride]);
			}
			printf("\n");
		}
	}
*/
//	for(int i=0; i<numEq/8; i++) src[i]=255;
	return (numEq > vFlatnessThreshold) ? 1 : 0;
}
static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
{
#ifdef HAVE_MMX
	int isOk;
	src+= stride*3;
	asm volatile(
//		"int $3 \n\t"
		"movq (%1, %2), %%mm0 \n\t"
		"movq (%1, %2, 8), %%mm1 \n\t"
		"movq %%mm0, %%mm2 \n\t"
		"psubusb %%mm1, %%mm0 \n\t"
		"psubusb %%mm2, %%mm1 \n\t"
		"por %%mm1, %%mm0 \n\t" // ABS Diff

		"movq pQPb, %%mm7 \n\t" // QP,..., QP
		"paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
		"psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
		"pcmpeqd b00, %%mm0 \n\t"
		"psrlq $16, %%mm0 \n\t"
		"pcmpeqd bFF, %%mm0 \n\t"
//		"movd %%mm0, (%1, %2, 4)\n\t"
		"movd %%mm0, %0 \n\t"
		: "=r" (isOk)
		: "r" (src), "r" (stride)
		);
	return isOk ? 1 : 0;
#else
	int isOk2= 1;
	int x;
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
	}
/*	if(isOk && !isOk2 || !isOk && isOk2)
	{
		printf("\nasm:%d  c:%d QP:%d\n", isOk, isOk2, QP);
		for(int y=0; y<9; y++)
		{
			for(int x=0; x<8; x++)
			{
				printf("%d ", src[x + y*stride]);
			}
			printf("\n");
		}
	} */

	return isOk2;
#endif
}
/**
 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
 */
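/* For reference, with first/last denoting the edge-clamped values used by
   the C path below, the new l4 works out to
   (4*l4 + 2*(l2+l3+l5+l6) + first + l1 + l7 + l8 + 8) / 16,
   i.e. exactly the (1,1,2,2,4,2,2,1,1)/16 tap set. */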
static inline void doVertLowPass(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
	asm volatile( //"movv %0 %1 %2\n\t"
		"pushl %0 \n\t"
		"movq pQPb, %%mm0 \n\t" // QP,..., QP

		"movq (%0), %%mm6 \n\t"
		"movq (%0, %1), %%mm5 \n\t"
		"movq %%mm5, %%mm1 \n\t"
		"movq %%mm6, %%mm2 \n\t"
		"psubusb %%mm6, %%mm5 \n\t"
		"psubusb %%mm1, %%mm2 \n\t"
		"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
		"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
		"pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF

		"pand %%mm2, %%mm6 \n\t"
		"pandn %%mm1, %%mm2 \n\t"
		"por %%mm2, %%mm6 \n\t" // First Line to Filter

		"movq (%0, %1, 8), %%mm5 \n\t"
		"leal (%0, %1, 4), %%eax \n\t"
		"leal (%0, %1, 8), %%ebx \n\t"
		"subl %1, %%ebx \n\t"
		"addl %1, %0 \n\t" // %0 points to line 1 not 0
		"movq (%0, %1, 8), %%mm7 \n\t"
		"movq %%mm5, %%mm1 \n\t"
		"movq %%mm7, %%mm2 \n\t"
		"psubusb %%mm7, %%mm5 \n\t"
		"psubusb %%mm1, %%mm2 \n\t"
		"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
		"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
		"pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF

		"pand %%mm2, %%mm7 \n\t"
		"pandn %%mm1, %%mm2 \n\t"
		"por %%mm2, %%mm7 \n\t" // Last Line to Filter

//	1	2	3	4	5	6	7	8
//	%0	%0+%1	%0+2%1	eax	%0+4%1	eax+2%1	ebx	eax+4%1
// 6 4 2 2 1 1
// 6 4 4 2
// 6 8 2

		"movq (%0, %1), %%mm0 \n\t" //  1
		"movq %%mm0, %%mm1 \n\t" //  1
		PAVGB(%%mm6, %%mm0) //1 1	/2
		PAVGB(%%mm6, %%mm0) //3 1	/4

		"movq (%0, %1, 4), %%mm2 \n\t" //     1
		"movq %%mm2, %%mm5 \n\t" //     1
		PAVGB((%%eax), %%mm2) //    11	/2
		PAVGB((%0, %1, 2), %%mm2) //   211	/4
		"movq %%mm2, %%mm3 \n\t" //   211	/4
		"movq (%0), %%mm4 \n\t" // 1
		PAVGB(%%mm4, %%mm3) // 4 211	/8
		PAVGB(%%mm0, %%mm3) //642211	/16
		"movq %%mm3, (%0) \n\t" // X
		// mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
		"movq %%mm1, %%mm0 \n\t" //  1
		PAVGB(%%mm6, %%mm0) //1 1	/2
		"movq %%mm4, %%mm3 \n\t" // 1
		PAVGB((%0,%1,2), %%mm3) // 1 1	/2
		PAVGB((%%eax,%1,2), %%mm5) //     11	/2
		PAVGB((%%eax), %%mm5) //    211	/4
		PAVGB(%%mm5, %%mm3) // 2 2211	/8
		PAVGB(%%mm0, %%mm3) //4242211	/16
		"movq %%mm3, (%0,%1) \n\t" //  X
		// mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
		PAVGB(%%mm4, %%mm6) //11	/2
		"movq (%%ebx), %%mm0 \n\t" //       1
		PAVGB((%%eax, %1, 2), %%mm0) //      11/2
		"movq %%mm0, %%mm3 \n\t" //      11/2
		PAVGB(%%mm1, %%mm0) //  2   11/4
		PAVGB(%%mm6, %%mm0) //222   11/8
		PAVGB(%%mm2, %%mm0) //22242211/16
		"movq (%0, %1, 2), %%mm2 \n\t" //   1
		"movq %%mm0, (%0, %1, 2) \n\t" //   X
		// mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
		"movq (%%eax, %1, 4), %%mm0 \n\t" //        1
		PAVGB((%%ebx), %%mm0) //       11/2
		PAVGB(%%mm0, %%mm6) //11     11/4
		PAVGB(%%mm1, %%mm4) // 11		/2
		PAVGB(%%mm2, %%mm1) //  11		/2
		PAVGB(%%mm1, %%mm6) //1122   11/8
		PAVGB(%%mm5, %%mm6) //112242211/16
		"movq (%%eax), %%mm5 \n\t" //    1
		"movq %%mm6, (%%eax) \n\t" //    X
		// mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
		"movq (%%eax, %1, 4), %%mm6 \n\t" //        1
		PAVGB(%%mm7, %%mm6) //        11/2
		PAVGB(%%mm4, %%mm6) // 11     11/4
		PAVGB(%%mm3, %%mm6) // 11   2211/8
		PAVGB(%%mm5, %%mm2) //   11		/2
		"movq (%0, %1, 4), %%mm4 \n\t" //     1
		PAVGB(%%mm4, %%mm2) //   112		/4
		PAVGB(%%mm2, %%mm6) // 112242211	/16
		"movq %%mm6, (%0, %1, 4) \n\t" //     X
		// mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
		PAVGB(%%mm7, %%mm1) //  11     2	/4
		PAVGB(%%mm4, %%mm5) //    11		/2
		PAVGB(%%mm5, %%mm0) //    11 11	/4
		"movq (%%eax, %1, 2), %%mm6 \n\t" //      1
		PAVGB(%%mm6, %%mm1) //  11  4  2	/8
		PAVGB(%%mm0, %%mm1) //  11224222	/16
		"movq %%mm1, (%%eax, %1, 2) \n\t" //      X
		// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
		PAVGB((%%ebx), %%mm2) //   112 4	/8
		"movq (%%eax, %1, 4), %%mm0 \n\t" //        1
		PAVGB(%%mm0, %%mm6) //      1 1	/2
		PAVGB(%%mm7, %%mm6) //      1 12	/4
		PAVGB(%%mm2, %%mm6) //   1122424	/4
		"movq %%mm6, (%%ebx) \n\t" //       X
		// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
		PAVGB(%%mm7, %%mm5) //    11   2	/4
		PAVGB(%%mm7, %%mm5) //    11   6	/8

		PAVGB(%%mm3, %%mm0) //      112	/4
		PAVGB(%%mm0, %%mm5) //    112246	/16
		"movq %%mm5, (%%eax, %1, 4) \n\t" //        X
		"popl %0\n\t"

		:
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
		);
#else
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
	const int l9= stride + l8;
	int x;
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
		const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];

		int sums[9];
		sums[0] = first + src[l1];
		sums[1] = src[l1] + src[l2];
		sums[2] = src[l2] + src[l3];
		sums[3] = src[l3] + src[l4];
		sums[4] = src[l4] + src[l5];
		sums[5] = src[l5] + src[l6];
		sums[6] = src[l6] + src[l7];
		sums[7] = src[l7] + src[l8];
		sums[8] = src[l8] + last;

		src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
		src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
		src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
		src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
		src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
		src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
		src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
		src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;

		src++;
	}
#endif
}
/**
 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
 * values are correctly clipped (MMX2)
 * values are wraparound (C)
 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
 *
 * example on a step edge (lines l3 l4 l5 l6):
 *	0 8 16 24
 *	x   = 8 (the step l5 - l4)
 *	x/2 = 4 (added to l4, subtracted from l5)
 *	x/8 = 1 (added to l3, subtracted from l6)
 *	1 12 12 23
 */
static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
// FIXME rounding
	asm volatile(
		"pxor %%mm7, %%mm7 \n\t" // 0
		"movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"movq pQPb, %%mm0 \n\t" // QP,..., QP
		"movq %%mm0, %%mm1 \n\t" // QP,..., QP
		"paddusb b02, %%mm0 \n\t"
		"psrlw $2, %%mm0 \n\t"
		"pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
		"paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
		"movq (%0, %1, 4), %%mm2 \n\t" // line 4
		"movq (%%ebx), %%mm3 \n\t" // line 5
		"movq %%mm2, %%mm4 \n\t" // line 4
		"pcmpeqb %%mm5, %%mm5 \n\t" // -1
		"pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
		PAVGB(%%mm3, %%mm5)
		"paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
		"psubusb %%mm3, %%mm4 \n\t"
		"psubusb %%mm2, %%mm3 \n\t"
		"por %%mm3, %%mm4 \n\t" // |l4 - l5|
		"psubusb %%mm0, %%mm4 \n\t"
		"pcmpeqb %%mm7, %%mm4 \n\t"
		"pand %%mm4, %%mm5 \n\t" // d/2

//		"paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
		"paddb %%mm5, %%mm2 \n\t"
//		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%0,%1, 4) \n\t"

		"movq (%%ebx), %%mm2 \n\t"
//		"paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
		"psubb %%mm5, %%mm2 \n\t"
//		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%%ebx) \n\t"

		"paddb %%mm6, %%mm5 \n\t"
		"psrlw $2, %%mm5 \n\t"
		"pand b3F, %%mm5 \n\t"
		"psubb b20, %%mm5 \n\t" // (l5-l4)/8

		"movq (%%eax, %1, 2), %%mm2 \n\t"
		"paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
		"paddsb %%mm5, %%mm2 \n\t"
		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%%eax, %1, 2) \n\t"

		"movq (%%ebx, %1), %%mm2 \n\t"
		"paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
		"psubsb %%mm5, %%mm2 \n\t"
		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%%ebx, %1) \n\t"

		:
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
		);
#else
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
	const int l9= stride + l8;
	int x;
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		if(ABS(src[l4]-src[l5]) < QP + QP/4)
		{
			int v = (src[l5] - src[l4]);

			src[l3] +=v/8;
			src[l4] +=v/2;
			src[l5] -=v/2;
			src[l6] -=v/8;
		}
		src++;
	}
#endif
}
/**
 * Experimental Filter 1
 * will not damage linear gradients
 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
 * MMX2 version does correct clipping, C version doesn't
 */
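/* The correction is strongest at the block edge and tapers off:
   d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2) measures how much the edge
   step exceeds the surrounding activity; the C path below distributes
   d/8, d/4, 3d/8 over l2..l4 (mirrored over l5..l7). */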
static inline void vertX1Filter(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
	asm volatile(
		"pxor %%mm7, %%mm7 \n\t" // 0
//		"movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
		"movq (%0, %1, 4), %%mm1 \n\t" // line 4
		"movq %%mm1, %%mm2 \n\t" // line 4
		"psubusb %%mm0, %%mm1 \n\t"
		"psubusb %%mm2, %%mm0 \n\t"
		"por %%mm1, %%mm0 \n\t" // |l3 - l4|
		"movq (%%ebx), %%mm3 \n\t" // line 5
		"movq (%%ebx, %1), %%mm4 \n\t" // line 6
		"movq %%mm3, %%mm5 \n\t" // line 5
		"psubusb %%mm4, %%mm3 \n\t"
		"psubusb %%mm5, %%mm4 \n\t"
		"por %%mm4, %%mm3 \n\t" // |l5 - l6|
		PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
		"movq %%mm2, %%mm1 \n\t" // line 4
		"psubusb %%mm5, %%mm2 \n\t"
		"movq %%mm2, %%mm4 \n\t"
		"pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
		"psubusb %%mm1, %%mm5 \n\t"
		"por %%mm5, %%mm4 \n\t" // |l4 - l5|
		"psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
		"movq %%mm4, %%mm3 \n\t" // d
		"psubusb pQPb, %%mm4 \n\t"
		"pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
		"psubusb b01, %%mm3 \n\t"
		"pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0

		PAVGB(%%mm7, %%mm3) // d/2
		"movq %%mm3, %%mm1 \n\t" // d/2
		PAVGB(%%mm7, %%mm3) // d/4
		PAVGB(%%mm1, %%mm3) // 3*d/8

		"movq (%0, %1, 4), %%mm0 \n\t" // line 4
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
		"psubusb %%mm3, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%0, %1, 4) \n\t" // line 4

		"movq (%%ebx), %%mm0 \n\t" // line 5
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
		"paddusb %%mm3, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%%ebx) \n\t" // line 5

		PAVGB(%%mm7, %%mm1) // d/4

		"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
		"psubusb %%mm1, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%%eax, %1, 2) \n\t" // line 3

		"movq (%%ebx, %1), %%mm0 \n\t" // line 6
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
		"paddusb %%mm1, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%%ebx, %1) \n\t" // line 6

		PAVGB(%%mm7, %%mm1) // d/8

		"movq (%%eax, %1), %%mm0 \n\t" // line 2
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
		"psubusb %%mm1, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%%eax, %1) \n\t" // line 2

		"movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
		"paddusb %%mm1, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7

		:
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
		);
#else
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
	const int l9= stride + l8;
	int x;
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		int a= src[l3] - src[l4];
		int b= src[l4] - src[l5];
		int c= src[l5] - src[l6];

		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);

		if(d < QP)
		{
			int v = d * SIGN(-b);

			src[l2] +=v/8;
			src[l3] +=v/4;
			src[l4] +=3*v/8;
			src[l5] -=3*v/8;
			src[l6] -=v/4;
			src[l7] -=v/8;
		}
		src++;
	}
/*
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
	const int l9= stride + l8;
	for(int x=0; x<BLOCK_SIZE; x++)
	{
		int v2= src[l2];
		int v3= src[l3];
		int v4= src[l4];
		int v5= src[l5];
		int v6= src[l6];
		int v7= src[l7];

		if(ABS(v4-v5)<QP &&  ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
		{
			src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6         )/16;
			src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7  )/16;
			src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
			src[l6] = (       1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
		}
		src++;
	}
*/
#endif
}
/**
 * Experimental Filter 1 (Horizontal)
 * will not damage linear gradients
 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 * can only smooth blocks at the expected locations (it can't smooth them if they did move)
 * MMX2 version does correct clipping, C version doesn't
 * not identical with the vertical one
 */
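/* The MMX2/3DNow path below builds a 256-entry lookup table: the index byte
   is the correction d, and each lut entry packs eight per-pixel offsets so a
   single paddsb applies the whole tap set at once. Pixels are biased by 0x80
   (b80) before and after the paddsb because the packed offsets are signed. */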
static inline void horizX1Filter(uint8_t *src, int stride, int QP)
{
	int y;
	static uint64_t *lut= NULL;
	if(lut==NULL)
	{
		int i;
		lut= (uint64_t*)memalign(8, 256*8);
		for(i=0; i<256; i++)
		{
			int v= i < 128 ? 2*i : 2*(i-256);
/*
//Simulate 112242211 9-Tap filter
			uint64_t a= (v/16) & 0xFF;
			uint64_t b= (v/8) & 0xFF;
			uint64_t c= (v/4) & 0xFF;
			uint64_t d= (3*v/8) & 0xFF;
*/
//Simulate piecewise linear interpolation
			uint64_t a= (v/16) & 0xFF;
			uint64_t b= (v*3/16) & 0xFF;
			uint64_t c= (v*5/16) & 0xFF;
			uint64_t d= (7*v/16) & 0xFF;
			uint64_t A= (0x100 - a)&0xFF;
			uint64_t B= (0x100 - b)&0xFF;
			uint64_t C= (0x100 - c)&0xFF;
			uint64_t D= (0x100 - d)&0xFF;

			lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
				(D<<24) | (C<<16) | (B<<8) | (A);
			//lut[i] = (v<<32) | (v<<24);
		}
	}
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"pxor %%mm7, %%mm7 \n\t" // 0
//		"movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"

		"movq b80, %%mm6 \n\t"
		"movd pQPb, %%mm5 \n\t" // QP
		"movq %%mm5, %%mm4 \n\t"
		"paddusb %%mm5, %%mm5 \n\t" // 2QP
		"paddusb %%mm5, %%mm4 \n\t" // 3QP
		"pxor %%mm5, %%mm5 \n\t" // 0
		"psubb %%mm4, %%mm5 \n\t" // -3QP
		"por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
		"psllq $24, %%mm5 \n\t"

//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

#define HX1old(a) \
		"movd " #a ", %%mm0 \n\t"\
		"movd 4" #a ", %%mm1 \n\t"\
		"punpckldq %%mm1, %%mm0 \n\t"\
		"movq %%mm0, %%mm1 \n\t"\
		"movq %%mm0, %%mm2 \n\t"\
		"psrlq $8, %%mm1 \n\t"\
		"psubusb %%mm1, %%mm2 \n\t"\
		"psubusb %%mm0, %%mm1 \n\t"\
		"por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
		"pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
		"pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
		PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
		"psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
		"psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
		"paddb %%mm5, %%mm1 \n\t"\
		"psubusb %%mm5, %%mm1 \n\t"\
		PAVGB(%%mm7, %%mm1)\
		"pxor %%mm2, %%mm1 \n\t"\
		"psubb %%mm2, %%mm1 \n\t"\
		"psrlq $24, %%mm1 \n\t"\
		"movd %%mm1, %%ecx \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"movq %%mm0, " #a " \n\t"

/*
HX1old((%0))
HX1old((%%eax))
HX1old((%%eax, %1))
HX1old((%%eax, %1, 2))
HX1old((%0, %1, 4))
HX1old((%%ebx))
HX1old((%%ebx, %1))
HX1old((%%ebx, %1, 2))
*/

//FIXME add some comments, it's unreadable ...
#define HX1b(a, c, b, d) \
		"movd " #a ", %%mm0 \n\t"\
		"movd 4" #a ", %%mm1 \n\t"\
		"punpckldq %%mm1, %%mm0 \n\t"\
		"movd " #b ", %%mm4 \n\t"\
		"movq %%mm0, %%mm1 \n\t"\
		"movq %%mm0, %%mm2 \n\t"\
		"psrlq $8, %%mm1 \n\t"\
		"movd 4" #b ", %%mm3 \n\t"\
		"psubusb %%mm1, %%mm2 \n\t"\
		"psubusb %%mm0, %%mm1 \n\t"\
		"por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
		"pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
		"punpckldq %%mm3, %%mm4 \n\t"\
		"movq %%mm1, %%mm3 \n\t"\
		"psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
		PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
		"paddb %%mm6, %%mm0 \n\t"\
		"psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
		"psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
		"movq %%mm4, %%mm3 \n\t"\
		"paddb %%mm5, %%mm1 \n\t"\
		"psubusb %%mm5, %%mm1 \n\t"\
		"psrlq $8, %%mm3 \n\t"\
		PAVGB(%%mm7, %%mm1)\
		"pxor %%mm2, %%mm1 \n\t"\
		"psubb %%mm2, %%mm1 \n\t"\
		"movq %%mm4, %%mm2 \n\t"\
		"psrlq $24, %%mm1 \n\t"\
		"psubusb %%mm3, %%mm2 \n\t"\
		"movd %%mm1, %%ecx \n\t"\
		"psubusb %%mm4, %%mm3 \n\t"\
		"paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
		"por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\
		"paddb %%mm6, %%mm0 \n\t"\
		"pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
		"movq %%mm3, %%mm1 \n\t"\
		"psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\
		"movq %%mm0, " #a " \n\t"\
		PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
		"paddb %%mm6, %%mm4 \n\t"\
		"psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
		"psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
		"paddb %%mm5, %%mm3 \n\t"\
		"psubusb %%mm5, %%mm3 \n\t"\
		PAVGB(%%mm7, %%mm3)\
		"pxor %%mm2, %%mm3 \n\t"\
		"psubb %%mm2, %%mm3 \n\t"\
		"psrlq $24, %%mm3 \n\t"\
		"movd " #c ", %%mm0 \n\t"\
		"movd 4" #c ", %%mm1 \n\t"\
		"punpckldq %%mm1, %%mm0 \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"movq %%mm0, " #c " \n\t"\
		"movd %%mm3, %%ecx \n\t"\
		"movd " #d ", %%mm0 \n\t"\
		"paddsb (%2, %%ecx, 8), %%mm4 \n\t"\
		"movd 4" #d ", %%mm1 \n\t"\
		"paddb %%mm6, %%mm4 \n\t"\
		"punpckldq %%mm1, %%mm0 \n\t"\
		"movq %%mm4, " #b " \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
		"paddb %%mm6, %%mm0 \n\t"\
		"movq %%mm0, " #d " \n\t"

HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))

		:
		: "r" (src), "r" (stride), "r" (lut)
		: "%eax", "%ebx", "%ecx"
		);
#else
//FIXME (has little in common with the mmx2 version)
	for(y=0; y<BLOCK_SIZE; y++)
	{
		int a= src[1] - src[2];
		int b= src[3] - src[4];
		int c= src[5] - src[6];

		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);

		if(d < QP)
		{
			int v = d * SIGN(-b);

			src[1] +=v/8;
			src[2] +=v/4;
			src[3] +=3*v/8;
			src[4] -=3*v/8;
			src[5] -=v/4;
			src[6] -=v/8;
		}
		src+=stride;
	}
#endif
}
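
/**
 * Default deblocking filter for the edge between line 4 and line 5.
 * The C path computes middleEnergy = 5*(l5-l4) + 2*(l3-l6); if it is below
 * 8*QP the step is treated as a blocking artifact and a correction
 * d = (5*MAX(0, |middle| - MIN(|left|, |right|)) + 32) >> 6, clipped to at
 * most half the step q = (l4-l5)/2, is applied to l4 and l5.
 */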
static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
{
#ifdef HAVE_MMX
	src+= stride*4;
//FIXME try pmul for *5 stuff
//	src[0]=0;
	asm volatile(
		"pxor %%mm7, %%mm7 \n\t"
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7
//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ebx+%1	ebx+2%1
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1

		"movq (%0), %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
		"punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0

		"movq (%%eax), %%mm2 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
		"punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1

		"movq (%%eax, %1), %%mm4 \n\t"
		"movq %%mm4, %%mm5 \n\t"
		"punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
		"punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2

		"paddw %%mm0, %%mm0 \n\t" // 2L0
		"paddw %%mm1, %%mm1 \n\t" // 2H0
		"psubw %%mm4, %%mm2 \n\t" // L1 - L2
		"psubw %%mm5, %%mm3 \n\t" // H1 - H2
		"psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
		"psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2

		"psllw $2, %%mm2 \n\t" // 4L1 - 4L2
		"psllw $2, %%mm3 \n\t" // 4H1 - 4H2
		"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
		"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2

		"movq (%%eax, %1, 2), %%mm2 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t" // L3
		"punpckhbw %%mm7, %%mm3 \n\t" // H3

		"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
		"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
		"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
		"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
		"movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
		"movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3

		"movq (%0, %1, 4), %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t" // L4
		"punpckhbw %%mm7, %%mm1 \n\t" // H4

		"psubw %%mm0, %%mm2 \n\t" // L3 - L4
		"psubw %%mm1, %%mm3 \n\t" // H3 - H4
		"movq %%mm2, temp2 \n\t" // L3 - L4
		"movq %%mm3, temp3 \n\t" // H3 - H4
		"paddw %%mm4, %%mm4 \n\t" // 2L2
		"paddw %%mm5, %%mm5 \n\t" // 2H2
		"psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
		"psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4

		"psllw $2, %%mm2 \n\t" // 4L3 - 4L4
		"psllw $2, %%mm3 \n\t" // 4H3 - 4H4
		"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
		"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
//50 opcodes so far
		"movq (%%ebx), %%mm2 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t" // L5
		"punpckhbw %%mm7, %%mm3 \n\t" // H5
		"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
		"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
		"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
		"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5

		"movq (%%ebx, %1), %%mm6 \n\t"
		"punpcklbw %%mm7, %%mm6 \n\t" // L6
		"psubw %%mm6, %%mm2 \n\t" // L5 - L6
		"movq (%%ebx, %1), %%mm6 \n\t"
		"punpckhbw %%mm7, %%mm6 \n\t" // H6
		"psubw %%mm6, %%mm3 \n\t" // H5 - H6

		"paddw %%mm0, %%mm0 \n\t" // 2L4
		"paddw %%mm1, %%mm1 \n\t" // 2H4
		"psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
		"psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6

		"psllw $2, %%mm2 \n\t" // 4L5 - 4L6
		"psllw $2, %%mm3 \n\t" // 4H5 - 4H6
		"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
		"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6

		"movq (%%ebx, %1, 2), %%mm2 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t" // L7
		"punpckhbw %%mm7, %%mm3 \n\t" // H7

		"paddw %%mm2, %%mm2 \n\t" // 2L7
		"paddw %%mm3, %%mm3 \n\t" // 2H7
		"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
		"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7

		"movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
		"movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
//FIXME pxor, psubw, pmax for abs
		"movq %%mm7, %%mm6 \n\t" // 0
		"pcmpgtw %%mm0, %%mm6 \n\t"
		"pxor %%mm6, %%mm0 \n\t"
		"psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
		"movq %%mm7, %%mm6 \n\t" // 0
		"pcmpgtw %%mm1, %%mm6 \n\t"
		"pxor %%mm6, %%mm1 \n\t"
		"psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|

		"movq %%mm7, %%mm6 \n\t" // 0
		"pcmpgtw %%mm2, %%mm6 \n\t"
		"pxor %%mm6, %%mm2 \n\t"
		"psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
		"movq %%mm7, %%mm6 \n\t" // 0
		"pcmpgtw %%mm3, %%mm6 \n\t"
		"pxor %%mm6, %%mm3 \n\t"
		"psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|

#ifdef HAVE_MMX2
		"pminsw %%mm2, %%mm0 \n\t"
		"pminsw %%mm3, %%mm1 \n\t"
#else
		"movq %%mm0, %%mm6 \n\t"
		"psubusw %%mm2, %%mm6 \n\t"
		"psubw %%mm6, %%mm0 \n\t"
		"movq %%mm1, %%mm6 \n\t"
		"psubusw %%mm3, %%mm6 \n\t"
		"psubw %%mm6, %%mm1 \n\t"
#endif

		"movq %%mm7, %%mm6 \n\t" // 0
		"pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
		"pxor %%mm6, %%mm4 \n\t"
		"psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
		"pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
		"pxor %%mm7, %%mm5 \n\t"
		"psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
// 100 opcodes
		"movd %2, %%mm2 \n\t" // QP
		"punpcklwd %%mm2, %%mm2 \n\t"
		"punpcklwd %%mm2, %%mm2 \n\t"
		"psllw $3, %%mm2 \n\t" // 8QP
		"movq %%mm2, %%mm3 \n\t" // 8QP
		"pcmpgtw %%mm4, %%mm2 \n\t"
		"pcmpgtw %%mm5, %%mm3 \n\t"
		"pand %%mm2, %%mm4 \n\t"
		"pand %%mm3, %%mm5 \n\t"

		"psubusw %%mm0, %%mm4 \n\t" // hd
		"psubusw %%mm1, %%mm5 \n\t" // ld

		"movq w05, %%mm2 \n\t" // 5
		"pmullw %%mm2, %%mm4 \n\t"
		"pmullw %%mm2, %%mm5 \n\t"
		"movq w20, %%mm2 \n\t" // 32
		"paddw %%mm2, %%mm4 \n\t"
		"paddw %%mm2, %%mm5 \n\t"
		"psrlw $6, %%mm4 \n\t"
		"psrlw $6, %%mm5 \n\t"

/*
		"movq w06, %%mm2 \n\t" // 6
		"paddw %%mm2, %%mm4 \n\t"
		"paddw %%mm2, %%mm5 \n\t"
		"movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
//FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
		"pmulhw %%mm2, %%mm4 \n\t" // hd/13
		"pmulhw %%mm2, %%mm5 \n\t" // ld/13
*/

		"movq temp2, %%mm0 \n\t" // L3 - L4
		"movq temp3, %%mm1 \n\t" // H3 - H4

		"pxor %%mm2, %%mm2 \n\t"
		"pxor %%mm3, %%mm3 \n\t"

		// FIXME rounding error
		"psraw $1, %%mm0 \n\t" // (L3 - L4)/2
		"psraw $1, %%mm1 \n\t" // (H3 - H4)/2
		"pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
		"pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
		"pxor %%mm2, %%mm0 \n\t"
		"pxor %%mm3, %%mm1 \n\t"
		"psubw %%mm2, %%mm0 \n\t" // |L3-L4|
		"psubw %%mm3, %%mm1 \n\t" // |H3-H4|
//		"psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
//		"psrlw $1, %%mm1 \n\t" // |H3 - H4|/2

		"pxor %%mm6, %%mm2 \n\t"
		"pxor %%mm7, %%mm3 \n\t"
		"pand %%mm2, %%mm4 \n\t"
		"pand %%mm3, %%mm5 \n\t"

#ifdef HAVE_MMX2
		"pminsw %%mm0, %%mm4 \n\t"
		"pminsw %%mm1, %%mm5 \n\t"
#else
		"movq %%mm4, %%mm2 \n\t"
		"psubusw %%mm0, %%mm2 \n\t"
		"psubw %%mm2, %%mm4 \n\t"
		"movq %%mm5, %%mm2 \n\t"
		"psubusw %%mm1, %%mm2 \n\t"
		"psubw %%mm2, %%mm5 \n\t"
#endif
		"pxor %%mm6, %%mm4 \n\t"
		"pxor %%mm7, %%mm5 \n\t"
		"psubw %%mm6, %%mm4 \n\t"
		"psubw %%mm7, %%mm5 \n\t"
		"packsswb %%mm5, %%mm4 \n\t"
		"movq (%%eax, %1, 2), %%mm0 \n\t"
		"paddb %%mm4, %%mm0 \n\t"
		"movq %%mm0, (%%eax, %1, 2) \n\t"
		"movq (%0, %1, 4), %%mm0 \n\t"
		"psubb %%mm4, %%mm0 \n\t"
		"movq %%mm0, (%0, %1, 4) \n\t"

		:
		: "r" (src), "r" (stride), "r" (QP)
		: "%eax", "%ebx"
		);
#else
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
//	const int l9= stride + l8;
	int x;
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
		if(ABS(middleEnergy) < 8*QP)
		{
			const int q=(src[l4] - src[l5])/2;
			const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
			const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);

			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
			d= MAX(d, 0);

			d= (5*d + 32) >> 6;
			d*= SIGN(-middleEnergy);

			if(q>0)
			{
				d= d<0 ? 0 : d;
				d= d>q ? q : d;
			}
			else
			{
				d= d>0 ? 0 : d;
				d= d<q ? q : d;
			}

			src[l4]-= d;
			src[l5]+= d;
		}
		src++;
	}
#endif
}
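
/* The horizontal filters below work on tempBlock (filled by
   isHorizDCAndCopy2Temp) so that the per-row reads are 8-byte aligned. */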
//FIXME? |255-0| = 1
/**
 * Check if the given 8x8 Block is mostly "flat" and copy the unaligned data into tempBlock.
 */
static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
{
//	src++;
	int numEq= 0;
#ifdef HAVE_MMX
	asm volatile (
//		"int $3 \n\t"
		"pushl %1\n\t"
		"movq b7E, %%mm7 \n\t" // mm7 = 0x7E
		"movq b7C, %%mm6 \n\t" // mm6 = 0x7C
		"leal tempBlock, %%eax \n\t"
		"pxor %%mm0, %%mm0 \n\t"

#define HDC_CHECK_AND_CPY(i) \
		"movq -4(%1), %%mm2 \n\t"\
		"psrlq $32, %%mm2 \n\t"\
		"punpckldq 4(%1), %%mm2 \n\t" /* (%1) */\
		"movq %%mm2, %%mm1 \n\t"\
		"psrlq $8, %%mm2 \n\t"\
		"psubb %%mm1, %%mm2 \n\t"\
		"paddb %%mm7, %%mm2 \n\t"\
		"pcmpgtb %%mm6, %%mm2 \n\t"\
		"paddb %%mm2, %%mm0 \n\t"\
		"movq %%mm1," #i "(%%eax) \n\t"

		HDC_CHECK_AND_CPY(0)
		"addl %2, %1 \n\t"
		HDC_CHECK_AND_CPY(8)
		"addl %2, %1 \n\t"
		HDC_CHECK_AND_CPY(16)
		"addl %2, %1 \n\t"
		HDC_CHECK_AND_CPY(24)
		"addl %2, %1 \n\t"
		HDC_CHECK_AND_CPY(32)
		"addl %2, %1 \n\t"
		HDC_CHECK_AND_CPY(40)
		"addl %2, %1 \n\t"
		HDC_CHECK_AND_CPY(48)
		"addl %2, %1 \n\t"
		HDC_CHECK_AND_CPY(56)

		"psllq $8, %%mm0 \n\t" // remove dummy value
		"movq %%mm0, %%mm1 \n\t"
		"psrlw $8, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlq $16, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlq $32, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"popl %1\n\t"
		"movd %%mm0, %0 \n\t"
		: "=r" (numEq)
		: "r" (src), "r" (stride)
		: "%eax"
		);
//	printf("%d\n", numEq);
	numEq= (256 - (numEq & 0xFF)) &0xFF;
#else
	int y;
	for(y=0; y<BLOCK_SIZE; y++)
	{
		if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
		tempBlock[0 + y*TEMP_STRIDE] = src[0];
		tempBlock[1 + y*TEMP_STRIDE] = src[1];
		tempBlock[2 + y*TEMP_STRIDE] = src[2];
		tempBlock[3 + y*TEMP_STRIDE] = src[3];
		tempBlock[4 + y*TEMP_STRIDE] = src[4];
		tempBlock[5 + y*TEMP_STRIDE] = src[5];
		tempBlock[6 + y*TEMP_STRIDE] = src[6];
		tempBlock[7 + y*TEMP_STRIDE] = src[7];
		src+= stride;
	}
#endif
/*	if(abs(numEq - asmEq) > 0)
	{
//		printf("\nasm:%d  c:%d\n", asmEq, numEq);
		for(int y=0; y<8; y++)
		{
			for(int x=0; x<8; x++)
			{
				printf("%d ", src[x + y*stride]);
			}
			printf("\n");
		}
	}
*/
//	printf("%d\n", numEq);
	return numEq > hFlatnessThreshold;
}
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
#ifdef MMX_FIXME
FIXME
	int isOk;
	asm volatile(
//		"int $3 \n\t"
		"movq (%1, %2), %%mm0 \n\t"
		"movq (%1, %2, 8), %%mm1 \n\t"
		"movq %%mm0, %%mm2 \n\t"
		"psubusb %%mm1, %%mm0 \n\t"
		"psubusb %%mm2, %%mm1 \n\t"
		"por %%mm1, %%mm0 \n\t" // ABS Diff

		"movq pQPb, %%mm7 \n\t" // QP,..., QP
		"paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
		"psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
		"pcmpeqd b00, %%mm0 \n\t"
		"psrlq $16, %%mm0 \n\t"
		"pcmpeqd bFF, %%mm0 \n\t"
//		"movd %%mm0, (%1, %2, 4)\n\t"
		"movd %%mm0, %0 \n\t"
		: "=r" (isOk)
		: "r" (src), "r" (stride)
		);
	return isOk;
#else
	if(abs(src[0] - src[7]) > 2*QP) return 0;

	return 1;
#endif
}
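
/* Note that the C path above is only a crude approximation: it compares
   src[0] with src[7] instead of checking every row (hence the "a" rating
   for isHorizMinMaxOk in the table at the top of this file). */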
static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP)
{
#ifdef HAVE_MMX
	asm volatile(
		"pushl %0 \n\t"
		"pxor %%mm7, %%mm7 \n\t"
		"movq bm00001000, %%mm6 \n\t"
		"movd %2, %%mm5 \n\t" // QP
		"movq %%mm5, %%mm4 \n\t"
		"paddusb %%mm5, %%mm5 \n\t" // 2QP
		"paddusb %%mm5, %%mm4 \n\t" // 3QP
		"psllq $24, %%mm4 \n\t"
		"pxor %%mm5, %%mm5 \n\t" // 0
		"psubb %%mm4, %%mm5 \n\t" // -3QP
		"leal tempBlock, %%eax \n\t"

//FIXME? "unroll by 2" and mix
#ifdef HAVE_MMX2
#define HDF(i) \
		"movq " #i "(%%eax), %%mm0 \n\t"\
		"movq %%mm0, %%mm1 \n\t"\
		"movq %%mm0, %%mm2 \n\t"\
		"psrlq $8, %%mm1 \n\t"\
		"psubusb %%mm1, %%mm2 \n\t"\
		"psubusb %%mm0, %%mm1 \n\t"\
		"por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
		"pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
		"pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
		"pminub %%mm1, %%mm3 \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\
		"psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
		"psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
		"paddb %%mm5, %%mm1 \n\t"\
		"psubusb %%mm5, %%mm1 \n\t"\
		"psrlw $2, %%mm1 \n\t"\
		"pxor %%mm2, %%mm1 \n\t"\
		"psubb %%mm2, %%mm1 \n\t"\
		"pand %%mm6, %%mm1 \n\t"\
		"psubb %%mm1, %%mm0 \n\t"\
		"psllq $8, %%mm1 \n\t"\
		"paddb %%mm1, %%mm0 \n\t"\
		"movd %%mm0, (%0) \n\t"\
		"psrlq $32, %%mm0 \n\t"\
		"movd %%mm0, 4(%0) \n\t"
#else
#define HDF(i)\
		"movq " #i "(%%eax), %%mm0 \n\t"\
		"movq %%mm0, %%mm1 \n\t"\
		"movq %%mm0, %%mm2 \n\t"\
		"psrlq $8, %%mm1 \n\t"\
		"psubusb %%mm1, %%mm2 \n\t"\
		"psubusb %%mm0, %%mm1 \n\t"\
		"por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
		"pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
		"movq %%mm1, %%mm3 \n\t"\
		"psllq $32, %%mm3 \n\t"\
		"movq %%mm3, %%mm4 \n\t"\
		"psubusb %%mm1, %%mm4 \n\t"\
		"psubb %%mm4, %%mm3 \n\t"\
		"psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
		"psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
		"paddb %%mm5, %%mm1 \n\t"\
		"psubusb %%mm5, %%mm1 \n\t"\
		"psrlw $2, %%mm1 \n\t"\
		"pxor %%mm2, %%mm1 \n\t"\
		"psubb %%mm2, %%mm1 \n\t"\
		"pand %%mm6, %%mm1 \n\t"\
		"psubb %%mm1, %%mm0 \n\t"\
		"psllq $8, %%mm1 \n\t"\
		"paddb %%mm1, %%mm0 \n\t"\
		"movd %%mm0, (%0) \n\t"\
		"psrlq $32, %%mm0 \n\t"\
		"movd %%mm0, 4(%0) \n\t"
#endif

		HDF(0)
		"addl %1, %0 \n\t"
		HDF(8)
		"addl %1, %0 \n\t"
		HDF(16)
		"addl %1, %0 \n\t"
		HDF(24)
		"addl %1, %0 \n\t"
		HDF(32)
		"addl %1, %0 \n\t"
		HDF(40)
		"addl %1, %0 \n\t"
		HDF(48)
		"addl %1, %0 \n\t"
		HDF(56)
		"popl %0 \n\t"
		:
		: "r" (dst), "r" (stride), "r" (QP)
		: "%eax"
		);
#else
	uint8_t *src= tempBlock;
	int y;
	for(y=0; y<BLOCK_SIZE; y++)
	{
		const int middleEnergy= 5*(src[4] - src[3]) + 2*(src[2] - src[5]);

		dst[0] = src[0];
		dst[1] = src[1];
		dst[2] = src[2];
		dst[3] = src[3];
		dst[4] = src[4];
		dst[5] = src[5];
		dst[6] = src[6];
		dst[7] = src[7];

		if(ABS(middleEnergy) < 8*QP)
		{
			const int q=(src[3] - src[4])/2;
			const int leftEnergy= 5*(src[2] - src[1]) + 2*(src[0] - src[3]);
			const int rightEnergy= 5*(src[6] - src[5]) + 2*(src[4] - src[7]);

			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
			d= MAX(d, 0);

			d= (5*d + 32) >> 6;
			d*= SIGN(-middleEnergy);

			if(q>0)
			{
				d= d<0 ? 0 : d;
				d= d>q ? q : d;
			}
			else
			{
				d= d>0 ? 0 : d;
				d= d<q ? q : d;
			}

			dst[3]-= d;
			dst[4]+= d;
		}
		dst+= stride;
		src+= TEMP_STRIDE;
	}
#endif
}
/**
 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
 * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
 */
static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
{
//return;
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile( //"movv %0 %1 %2\n\t"
		"pushl %0\n\t"
		"pxor %%mm7, %%mm7 \n\t"
		"leal tempBlock, %%eax \n\t"
/*
#define HLP1	"movq (%0), %%mm0 \n\t"\
		"movq %%mm0, %%mm1 \n\t"\
		"psllq $8, %%mm0 \n\t"\
		PAVGB(%%mm1, %%mm0)\
		"psrlw $8, %%mm0 \n\t"\
		"pxor %%mm1, %%mm1 \n\t"\
		"packuswb %%mm1, %%mm0 \n\t"\
		"movq %%mm0, %%mm1 \n\t"\
		"movq %%mm0, %%mm2 \n\t"\
		"psllq $32, %%mm0 \n\t"\
		"paddb %%mm0, %%mm1 \n\t"\
		"psllq $16, %%mm2 \n\t"\
		PAVGB(%%mm2, %%mm0)\
		"movq %%mm0, %%mm3 \n\t"\
		"pand bm11001100, %%mm0 \n\t"\
		"paddusb %%mm0, %%mm3 \n\t"\
		"psrlq $8, %%mm3 \n\t"\
		PAVGB(%%mm1, %%mm4)\
		PAVGB(%%mm3, %%mm2)\
		"psrlq $16, %%mm2 \n\t"\
		"punpcklbw %%mm2, %%mm2 \n\t"\
		"movq %%mm2, (%0) \n\t"\

#define HLP2	"movq (%0), %%mm0 \n\t"\
		"movq %%mm0, %%mm1 \n\t"\
		"psllq $8, %%mm0 \n\t"\
		PAVGB(%%mm1, %%mm0)\
		"psrlw $8, %%mm0 \n\t"\
		"pxor %%mm1, %%mm1 \n\t"\
		"packuswb %%mm1, %%mm0 \n\t"\
		"movq %%mm0, %%mm2 \n\t"\
		"psllq $32, %%mm0 \n\t"\
		"psllq $16, %%mm2 \n\t"\
		PAVGB(%%mm2, %%mm0)\
		"movq %%mm0, %%mm3 \n\t"\
		"pand bm11001100, %%mm0 \n\t"\
		"paddusb %%mm0, %%mm3 \n\t"\
		"psrlq $8, %%mm3 \n\t"\
		PAVGB(%%mm3, %%mm2)\
		"psrlq $16, %%mm2 \n\t"\
		"punpcklbw %%mm2, %%mm2 \n\t"\
		"movq %%mm2, (%0) \n\t"\
*/
// approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
/*
 Implemented	Exact 7-Tap
 9421		A321
 36421		64321
 334321		=
 1234321	=
 1234321	=
 123433		=
 12463		12346
 1249		123A
*/
#ifdef HAVE_MMX2
#define HLP3(i)	"movq " #i "(%%eax), %%mm0 \n\t"\
		"movq %%mm0, %%mm1 \n\t"\
		"movq %%mm0, %%mm2 \n\t"\
		"movq %%mm0, %%mm3 \n\t"\
		"movq %%mm0, %%mm4 \n\t"\
		"psllq $8, %%mm1 \n\t"\
		"psrlq $8, %%mm2 \n\t"\
		"pand bm00000001, %%mm3 \n\t"\
		"pand bm10000000, %%mm4 \n\t"\
		"por %%mm3, %%mm1 \n\t"\
		"por %%mm4, %%mm2 \n\t"\
		PAVGB(%%mm2, %%mm1)\
		PAVGB(%%mm1, %%mm0)\
\
		"pshufw $0xF9, %%mm0, %%mm3 \n\t"\
		"pshufw $0x90, %%mm0, %%mm4 \n\t"\
		PAVGB(%%mm3, %%mm4)\
		PAVGB(%%mm4, %%mm0)\
		"movd %%mm0, (%0) \n\t"\
		"psrlq $32, %%mm0 \n\t"\
		"movd %%mm0, 4(%0) \n\t"
#else
#define HLP3(i)	"movq " #i "(%%eax), %%mm0 \n\t"\
		"movq %%mm0, %%mm1 \n\t"\
		"movq %%mm0, %%mm2 \n\t"\
		"movd -4(%0), %%mm3 \n\t" /*0001000*/\
		"movd 8(%0), %%mm4 \n\t" /*0001000*/\
		"psllq $8, %%mm1 \n\t"\
		"psrlq $8, %%mm2 \n\t"\
		"psrlq $24, %%mm3 \n\t"\
		"psllq $56, %%mm4 \n\t"\
		"por %%mm3, %%mm1 \n\t"\
		"por %%mm4, %%mm2 \n\t"\
		PAVGB(%%mm2, %%mm1)\
		PAVGB(%%mm1, %%mm0)\
\
		"movq %%mm0, %%mm3 \n\t"\
		"movq %%mm0, %%mm4 \n\t"\
		"movq %%mm0, %%mm5 \n\t"\
		"psrlq $16, %%mm3 \n\t"\
		"psllq $16, %%mm4 \n\t"\
		"pand bm11000000, %%mm5 \n\t"\
		"por %%mm5, %%mm3 \n\t"\
		"movq %%mm0, %%mm5 \n\t"\
		"pand bm00000011, %%mm5 \n\t"\
		"por %%mm5, %%mm4 \n\t"\
		PAVGB(%%mm3, %%mm4)\
		PAVGB(%%mm4, %%mm0)\
		"movd %%mm0, (%0) \n\t"\
		"psrlq $32, %%mm0 \n\t"\
		"movd %%mm0, 4(%0) \n\t"
#endif

/* uses the 7-Tap Filter: 1112111 */
#define NEW_HLP(i)\
		"movq " #i "(%%eax), %%mm0 \n\t"\
		"movq %%mm0, %%mm1 \n\t"\
		"movq %%mm0, %%mm2 \n\t"\
		"movd -4(%0), %%mm3 \n\t" /*0001000*/\
		"movd 8(%0), %%mm4 \n\t" /*0001000*/\
		"psllq $8, %%mm1 \n\t"\
		"psrlq $8, %%mm2 \n\t"\
		"psrlq $24, %%mm3 \n\t"\
		"psllq $56, %%mm4 \n\t"\
		"por %%mm3, %%mm1 \n\t"\
		"por %%mm4, %%mm2 \n\t"\
		"movq %%mm1, %%mm5 \n\t"\
		PAVGB(%%mm2, %%mm1)\
		PAVGB(%%mm1, %%mm0)\
		"psllq $8, %%mm5 \n\t"\
		"psrlq $8, %%mm2 \n\t"\
		"por %%mm3, %%mm5 \n\t"\
		"por %%mm4, %%mm2 \n\t"\
		"movq %%mm5, %%mm1 \n\t"\
		PAVGB(%%mm2, %%mm5)\
		"psllq $8, %%mm1 \n\t"\
		"psrlq $8, %%mm2 \n\t"\
		"por %%mm3, %%mm1 \n\t"\
		"por %%mm4, %%mm2 \n\t"\
		PAVGB(%%mm2, %%mm1)\
		PAVGB(%%mm1, %%mm5)\
		PAVGB(%%mm5, %%mm0)\
		"movd %%mm0, (%0) \n\t"\
		"psrlq $32, %%mm0 \n\t"\
		"movd %%mm0, 4(%0) \n\t"

/* uses the 9-Tap Filter: 112242211 */
#define NEW_HLP2(i)\
		"movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\
		"movq %%mm0, %%mm1 \n\t" /*0001000*/\
		"movq %%mm0, %%mm2 \n\t" /*0001000*/\
		"movd -4(%0), %%mm3 \n\t" /*0001000*/\
		"movd 8(%0), %%mm4 \n\t" /*0001000*/\
		"psllq $8, %%mm1 \n\t"\
		"psrlq $8, %%mm2 \n\t"\
		"psrlq $24, %%mm3 \n\t"\
		"psllq $56, %%mm4 \n\t"\
		"por %%mm3, %%mm1 \n\t" /*0010000*/\
		"por %%mm4, %%mm2 \n\t" /*0000100*/\
		"movq %%mm1, %%mm5 \n\t" /*0010000*/\
		PAVGB(%%mm2, %%mm1) /*0010100*/\
		PAVGB(%%mm1, %%mm0) /*0012100*/\
		"psllq $8, %%mm5 \n\t"\
		"psrlq $8, %%mm2 \n\t"\
		"por %%mm3, %%mm5 \n\t" /*0100000*/\
		"por %%mm4, %%mm2 \n\t" /*0000010*/\
		"movq %%mm5, %%mm1 \n\t" /*0100000*/\
		PAVGB(%%mm2, %%mm5) /*0100010*/\
		"psllq $8, %%mm1 \n\t"\
		"psrlq $8, %%mm2 \n\t"\
		"por %%mm3, %%mm1 \n\t" /*1000000*/\
		"por %%mm4, %%mm2 \n\t" /*0000001*/\
		"movq %%mm1, %%mm6 \n\t" /*1000000*/\
		PAVGB(%%mm2, %%mm1) /*1000001*/\
		"psllq $8, %%mm6 \n\t"\
		"psrlq $8, %%mm2 \n\t"\
		"por %%mm3, %%mm6 \n\t"/*100000000*/\
		"por %%mm4, %%mm2 \n\t"/*000000001*/\
		PAVGB(%%mm2, %%mm6) /*100000001*/\
		PAVGB(%%mm6, %%mm1) /*110000011*/\
		PAVGB(%%mm1, %%mm5) /*112000211*/\
		PAVGB(%%mm5, %%mm0) /*112242211*/\
		"movd %%mm0, (%0) \n\t"\
		"psrlq $32, %%mm0 \n\t"\
		"movd %%mm0, 4(%0) \n\t"

#define HLP(i) NEW_HLP(i)

		HLP(0)
		"addl %1, %0 \n\t"
		HLP(8)
		"addl %1, %0 \n\t"
		HLP(16)
		"addl %1, %0 \n\t"
		HLP(24)
		"addl %1, %0 \n\t"
		HLP(32)
		"addl %1, %0 \n\t"
		HLP(40)
		"addl %1, %0 \n\t"
		HLP(48)
		"addl %1, %0 \n\t"
		HLP(56)

		"popl %0\n\t"
		:
		: "r" (dst), "r" (stride)
		: "%eax", "%ebx"
		);
#else
uint8_t *temp= tempBlock;
int y;
for(y=0; y<BLOCK_SIZE; y++)
{
const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
int sums[9];
sums[0] = first + temp[0];
sums[1] = temp[0] + temp[1];
sums[2] = temp[1] + temp[2];
sums[3] = temp[2] + temp[3];
sums[4] = temp[3] + temp[4];
sums[5] = temp[4] + temp[5];
sums[6] = temp[5] + temp[6];
sums[7] = temp[6] + temp[7];
sums[8] = temp[7] + last;
dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
dst+= stride;
temp+= TEMP_STRIDE;
}
#endif
}
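/* Illustrative reference (an addition, not called by the filter code): how
   the 1112111 weights of NEW_HLP fall out of three nested byte averages.
   Assumes p[-3]..p[3] are valid pixels; each average rounds up, exactly
   like PAVGB. */
static inline int hlp7TapRef(const uint8_t *p)
{
	const int inner = (p[-1] + p[1] + 1)>>1;  /* avg(l1, r1) */
	const int mid   = (p[-2] + p[2] + 1)>>1;  /* avg(l2, r2) */
	const int outer = (p[-3] + p[3] + 1)>>1;  /* avg(l3, r3) */
	const int center= (inner + p[0] + 1)>>1;  /* (l1 + 2c + r1)/4 */
	const int side  = (outer + mid  + 1)>>1;  /* (l3+l2+r2+r3)/4 */
	return (center + side + 1)>>1;            /* (l3+l2+l1+2c+r1+r2+r3)/8 */
}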
static inline void dering(uint8_t src[], int stride, int QP)
{
//FIXME
#ifdef HAVE_MMX2X // intentionally disabled, the code below is unfinished
asm volatile(
"leal (%0, %1), %%eax \n\t"
"leal (%%eax, %1, 4), %%ebx \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
"pcmpeqb %%mm6, %%mm6 \n\t"
"pxor %%mm7, %%mm7 \n\t"
#define FIND_MIN_MAX(addr)\
"movq " #addr ", %%mm0 \n\t"\
"pminub %%mm0, %%mm6 \n\t"\
"pmaxub %%mm0, %%mm7 \n\t"
FIND_MIN_MAX((%0))
FIND_MIN_MAX((%%eax))
FIND_MIN_MAX((%%eax, %1))
FIND_MIN_MAX((%%eax, %1, 2))
FIND_MIN_MAX((%0, %1, 4))
FIND_MIN_MAX((%%ebx))
FIND_MIN_MAX((%%ebx, %1))
FIND_MIN_MAX((%%ebx, %1, 2))
FIND_MIN_MAX((%0, %1, 8))
FIND_MIN_MAX((%%ebx, %1, 4))
"movq %%mm6, %%mm4 \n\t"
"psrlq $32, %%mm6 \n\t"
"pminub %%mm4, %%mm6 \n\t"
"movq %%mm6, %%mm4 \n\t"
"psrlq $16, %%mm6 \n\t"
"pminub %%mm4, %%mm6 \n\t"
"movq %%mm6, %%mm4 \n\t"
"psrlq $8, %%mm6 \n\t"
"pminub %%mm4, %%mm6 \n\t" // min of pixels
"movq %%mm7, %%mm4 \n\t"
"psrlq $32, %%mm7 \n\t"
"pmaxub %%mm4, %%mm7 \n\t"
"movq %%mm7, %%mm4 \n\t"
"psrlq $16, %%mm7 \n\t"
"pmaxub %%mm4, %%mm7 \n\t"
"movq %%mm7, %%mm4 \n\t"
"psrlq $8, %%mm7 \n\t"
"pmaxub %%mm4, %%mm7 \n\t" // max of pixels
PAVGB(%%mm6, %%mm7) // (max + min)/2
: : "r" (src), "r" (stride), "r" (QP)
: "%eax", "%ebx"
);
#else
//FIXME
#endif
}
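/* Illustrative sketch (an addition, not called anywhere): plain C for what
   FIND_MIN_MAX plus the reductions above compute so far, the minimum and
   maximum byte of the 10 scanned lines and their rounded average. A
   finished dering filter would presumably compare each pixel against this
   midpoint. */
static inline int deringMinMaxMidRef(const uint8_t *src, int stride)
{
	int min= 255, max= 0, x, y;
	for(y=0; y<10; y++)
		for(x=0; x<8; x++)
		{
			const int v= src[y*stride + x];
			if(v < min) min= v;
			if(v > max) max= v;
		}
	return (min + max + 1)>>1; /* PAVGB(min, max) rounds up like this */
}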
/**
 * Deinterlaces the given block
 * will be called for every 8x8 block, and can read & write into an 8x16 block
 */
static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
asm volatile(
"leal (%0, %1), %%eax \n\t"
"leal (%%eax, %1, 4), %%ebx \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
"movq (%0), %%mm0 \n\t"
"movq (%%eax, %1), %%mm1 \n\t"
PAVGB(%%mm1, %%mm0)
"movq %%mm0, (%%eax) \n\t"
"movq (%0, %1, 4), %%mm0 \n\t"
PAVGB(%%mm0, %%mm1)
"movq %%mm1, (%%eax, %1, 2) \n\t"
"movq (%%ebx, %1), %%mm1 \n\t"
PAVGB(%%mm1, %%mm0)
"movq %%mm0, (%%ebx) \n\t"
"movq (%0, %1, 8), %%mm0 \n\t"
PAVGB(%%mm0, %%mm1)
"movq %%mm1, (%%ebx, %1, 2) \n\t"
: : "r" (src), "r" (stride)
: "%eax", "%ebx"
);
#else
int x;
for(x=0; x<8; x++)
{
src[stride] = (src[0] + src[stride*2])>>1;
src[stride*3] = (src[stride*2] + src[stride*4])>>1;
src[stride*5] = (src[stride*4] + src[stride*6])>>1;
src[stride*7] = (src[stride*6] + src[stride*8])>>1;
src++;
}
#endif
}
/**
 * Deinterlaces the given block
 * will be called for every 8x8 block, and can read & write into an 8x16 block
 * no clipping in C version
 */
static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
asm volatile(
"leal (%0, %1), %%eax \n\t"
"leal (%%eax, %1, 4), %%ebx \n\t"
"leal (%%ebx, %1, 4), %%ecx \n\t"
"addl %1, %%ecx \n\t"
"pxor %%mm7, %%mm7 \n\t"
// 0 1 2 3 4 5 6 7 8 9 10
// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 ecx
#define DEINT_CUBIC(a,b,c,d,e)\
"movq " #a ", %%mm0 \n\t"\
"movq " #b ", %%mm1 \n\t"\
"movq " #d ", %%mm2 \n\t"\
"movq " #e ", %%mm3 \n\t"\
PAVGB(%%mm2, %%mm1) /* (b+d)/2 */\
PAVGB(%%mm3, %%mm0) /* (a+e)/2 */\
"movq %%mm0, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm2 \n\t"\
"movq %%mm1, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
"psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
"psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
"psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
"psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
"psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
"packuswb %%mm3, %%mm1 \n\t"\
"movq %%mm1, " #c " \n\t"
DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
: : "r" (src), "r" (stride)
: "%eax", "%ebx", "%ecx"
);
#else
int x;
for(x=0; x<8; x++)
{
src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
src++;
}
#endif
}
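/* Illustrative reference (an addition): the arithmetic behind DEINT_CUBIC.
   With m = (b+d)/2 and n = (a+e)/2 the asm computes m - (n-m)/8, which is
   (9b + 9d - a - e)/16, the same cubic interpolation as the C fallback
   above, up to PAVGB rounding and the saturation done by packuswb. */
static inline int deintCubicRef(int a, int b, int d, int e)
{
	const int m= (b + d)>>1;
	const int n= (a + e)>>1;
	return m - ((n - m)>>3);
}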
/**
 * Deinterlaces the given block
 * will be called for every 8x8 block, and can read & write into an 8x16 block
 * will shift the image up by 1 line (FIXME if this is a problem)
 */
static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
asm volatile(
"leal (%0, %1), %%eax \n\t"
"leal (%%eax, %1, 4), %%ebx \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
"movq (%0), %%mm0 \n\t" // L0
"movq (%%eax, %1), %%mm1 \n\t" // L2
PAVGB(%%mm1, %%mm0) // L0+L2
"movq (%%eax), %%mm2 \n\t" // L1
PAVGB(%%mm2, %%mm0)
"movq %%mm0, (%0) \n\t"
"movq (%%eax, %1, 2), %%mm0 \n\t" // L3
PAVGB(%%mm0, %%mm2) // L1+L3
PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
"movq %%mm2, (%%eax) \n\t"
"movq (%0, %1, 4), %%mm2 \n\t" // L4
PAVGB(%%mm2, %%mm1) // L2+L4
PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
"movq %%mm1, (%%eax, %1) \n\t"
"movq (%%ebx), %%mm1 \n\t" // L5
PAVGB(%%mm1, %%mm0) // L3+L5
PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
"movq %%mm0, (%%eax, %1, 2) \n\t"
"movq (%%ebx, %1), %%mm0 \n\t" // L6
PAVGB(%%mm0, %%mm2) // L4+L6
PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
"movq %%mm2, (%0, %1, 4) \n\t"
"movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
PAVGB(%%mm2, %%mm1) // L5+L7
PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
"movq %%mm1, (%%ebx) \n\t"
"movq (%0, %1, 8), %%mm1 \n\t" // L8
PAVGB(%%mm1, %%mm0) // L6+L8
PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
"movq %%mm0, (%%ebx, %1) \n\t"
"movq (%%ebx, %1, 4), %%mm0 \n\t" // L9
PAVGB(%%mm0, %%mm2) // L7+L9
PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
"movq %%mm2, (%%ebx, %1, 2) \n\t"
: : "r" (src), "r" (stride)
: "%eax", "%ebx"
);
#else
int x;
for(x=0; x<8; x++)
{
src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
src++;
}
#endif
}
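/* Illustrative reference (an addition): the two chained PAVGBs per output
   line above realize the (1,2,1)/4 blend kernel without leaving 8 bits:
   avg(avg(above, below), center) = (above + 2*center + below)/4. */
static inline int blend121Ref(int above, int center, int below)
{
	return (((above + below + 1)>>1) + center + 1)>>1;
}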
/**
 * Deinterlaces the given block
 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
 */
static inline void deInterlaceMedian(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
#ifdef HAVE_MMX2
asm volatile(
"leal (%0, %1), %%eax \n\t"
"leal (%%eax, %1, 4), %%ebx \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
"movq (%0), %%mm0 \n\t" //
"movq (%%eax, %1), %%mm2 \n\t" //
"movq (%%eax), %%mm1 \n\t" //
"movq %%mm0, %%mm3 \n\t"
"pmaxub %%mm1, %%mm0 \n\t" //
"pminub %%mm3, %%mm1 \n\t" //
"pmaxub %%mm2, %%mm1 \n\t" //
"pminub %%mm1, %%mm0 \n\t"
"movq %%mm0, (%%eax) \n\t"
"movq (%0, %1, 4), %%mm0 \n\t" //
"movq (%%eax, %1, 2), %%mm1 \n\t" //
"movq %%mm2, %%mm3 \n\t"
"pmaxub %%mm1, %%mm2 \n\t" //
"pminub %%mm3, %%mm1 \n\t" //
"pmaxub %%mm0, %%mm1 \n\t" //
"pminub %%mm1, %%mm2 \n\t"
"movq %%mm2, (%%eax, %1, 2) \n\t"
"movq (%%ebx), %%mm2 \n\t" //
"movq (%%ebx, %1), %%mm1 \n\t" //
"movq %%mm2, %%mm3 \n\t"
"pmaxub %%mm0, %%mm2 \n\t" //
"pminub %%mm3, %%mm0 \n\t" //
"pmaxub %%mm1, %%mm0 \n\t" //
"pminub %%mm0, %%mm2 \n\t"
"movq %%mm2, (%%ebx) \n\t"
"movq (%%ebx, %1, 2), %%mm2 \n\t" //
"movq (%0, %1, 8), %%mm0 \n\t" //
"movq %%mm2, %%mm3 \n\t"
"pmaxub %%mm0, %%mm2 \n\t" //
"pminub %%mm3, %%mm0 \n\t" //
"pmaxub %%mm1, %%mm0 \n\t" //
"pminub %%mm0, %%mm2 \n\t"
"movq %%mm2, (%%ebx, %1, 2) \n\t"
: : "r" (src), "r" (stride)
: "%eax", "%ebx"
);
#else // MMX without MMX2
asm volatile(
"leal (%0, %1), %%eax \n\t"
"leal (%%eax, %1, 4), %%ebx \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
"pxor %%mm7, %%mm7 \n\t"
#define MEDIAN(a,b,c)\
"movq " #a ", %%mm0 \n\t"\
"movq " #b ", %%mm2 \n\t"\
"movq " #c ", %%mm1 \n\t"\
"movq %%mm0, %%mm3 \n\t"\
"movq %%mm1, %%mm4 \n\t"\
"movq %%mm2, %%mm5 \n\t"\
"psubusb %%mm1, %%mm3 \n\t"\
"psubusb %%mm2, %%mm4 \n\t"\
"psubusb %%mm0, %%mm5 \n\t"\
"pcmpeqb %%mm7, %%mm3 \n\t"\
"pcmpeqb %%mm7, %%mm4 \n\t"\
"pcmpeqb %%mm7, %%mm5 \n\t"\
"movq %%mm3, %%mm6 \n\t"\
"pxor %%mm4, %%mm3 \n\t"\
"pxor %%mm5, %%mm4 \n\t"\
"pxor %%mm6, %%mm5 \n\t"\
"por %%mm3, %%mm1 \n\t"\
"por %%mm4, %%mm2 \n\t"\
"por %%mm5, %%mm0 \n\t"\
"pand %%mm2, %%mm0 \n\t"\
"pand %%mm1, %%mm0 \n\t"\
"movq %%mm0, " #b " \n\t"
MEDIAN((%0), (%%eax), (%%eax, %1))
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))
: : "r" (src), "r" (stride)
: "%eax", "%ebx"
);
#endif // MMX
#else
int x;
for(x=0; x<8; x++)
{
int i;
for(i=1; i<8; i+=2)
{
int a= src[stride*(i-1)];
int b= src[stride*i];
int c= src[stride*(i+1)];
/* replace each odd line by the median of itself & its two neighbors,
matching what the MMX versions above do */
if(a > c) { const int t= a; a= c; c= t; }
if(b > c) b= c;
else if(b < a) b= a;
src[stride*i]= b;
}
src++;
}
#endif
}
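/* Illustrative reference (an addition): the branchless mask trick used by
   the MEDIAN macro above, written per byte in plain C. Each comparison
   yields an FF/00 mask; XORing neighbouring masks leaves 00 exactly on the
   value that sits in the middle, so ORing the masks in and ANDing all three
   selects the median without branches. */
static inline int medianMaskRef(int a, int b, int c)
{
	const int m1= (a <= c) ? 0xFF : 0;
	const int m2= (c <= b) ? 0xFF : 0;
	const int m3= (b <= a) ? 0xFF : 0;
	return (a | (m3 ^ m1)) & (b | (m2 ^ m3)) & (c | (m1 ^ m2));
}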
#ifdef HAVE_ODIVX_POSTPROCESS
#include "../opendivx/postprocess.h"
int use_old_pp=0;
#endif
static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
QP_STORE_T QPs[], int QPStride, int isColor, int mode);
/**
 * Main entry point: postprocesses the luma plane, then both chroma planes
 * at half resolution
 */
void postprocess(unsigned char * src[], int src_stride,
unsigned char * dst[], int dst_stride,
int horizontal_size, int vertical_size,
QP_STORE_T *QP_store, int QP_stride,
int mode)
{
#ifdef HAVE_ODIVX_POSTPROCESS
// Note: this check could be done outside of this file, but it would mean one
// more function call...
if(use_old_pp){
odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
return;
}
#endif
/*
long long T= rdtsc();
for(int y=vertical_size-1; y>=0 ; y--)
memcpy(dst[0] + y*src_stride, src[0] + y*src_stride,src_stride);
// memcpy(dst[0], src[0],src_stride*vertical_size);
printf("%4dk\r", (rdtsc()-T)/1000);
return;
*/
/*
long long T= rdtsc();
while( (rdtsc() - T)/1000 < 4000);
return;
*/
postProcess(src[0], src_stride, dst[0], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
horizontal_size >>= 1;
vertical_size >>= 1;
src_stride >>= 1;
dst_stride >>= 1;
mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
if(1)
{
postProcess(src[1], src_stride, dst[1], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
postProcess(src[2], src_stride, dst[2], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
}
else
{
memcpy(dst[1], src[1], src_stride*vertical_size);
memcpy(dst[2], src[2], src_stride*vertical_size);
}
}
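/* Illustrative usage sketch (an addition; the frame and QP variable names
   are assumptions, not part of this API): deblock one planar YUV 4:2:0
   frame. src/dst hold the Y, U, V plane pointers; the stride passed is the
   luma stride, postprocess() halves it for chroma itself. QP_store holds
   one QP per macroblock, QP_stride of them per row. */
static void examplePostprocessFrame(unsigned char *src[3], unsigned char *dst[3],
	int lumaStride, int width, int height, QP_STORE_T *QP_store, int QP_stride)
{
	const int mode= LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK;
	postprocess(src, lumaStride, dst, lumaStride, width, height,
		QP_store, QP_stride, mode);
}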
/**
 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
 * 0 <= quality <= 6
 */
int getPpModeForQuality(int quality){
int modes[1+GET_PP_QUALITY_MAX]= {
0,
#if 1
// horizontal filters first
LUM_H_DEBLOCK,
LUM_H_DEBLOCK | LUM_V_DEBLOCK,
LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
#else
// vertical filters first
LUM_V_DEBLOCK,
LUM_V_DEBLOCK | LUM_H_DEBLOCK,
LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
#endif
};
#ifdef HAVE_ODIVX_POSTPROCESS
int odivx_modes[1+GET_PP_QUALITY_MAX]= {
0,
PP_DEBLOCK_Y_H,
PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
};
if(use_old_pp) return odivx_modes[quality];
#endif
return modes[quality];
}
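/* Illustrative usage sketch (an addition): mapping a user-visible quality
   level to a mode word instead of hardcoding flags; the clamp is an extra
   precaution, since getPpModeForQuality() indexes modes[0..GET_PP_QUALITY_MAX]. */
static inline int ppModeFromUserQuality(int quality)
{
	if(quality < 0) quality= 0;
	if(quality > GET_PP_QUALITY_MAX) quality= GET_PP_QUALITY_MAX;
	return getPpModeForQuality(quality);
}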
//} // extern "C"
/**
 * Copies a block from src to dst and fixes the black level
 * numLines must be a multiple of 4
 * levelFix == 0 -> don't touch the brightness & contrast
 */
static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
int numLines, int levelFix)
{
int i;
if(levelFix)
{
#ifdef HAVE_MMX
asm volatile(
"movl %4, %%eax \n\t"
"movl %%eax, temp0\n\t"
"pushl %0 \n\t"
"pushl %1 \n\t"
"leal (%2,%2), %%eax \n\t"
"leal (%3,%3), %%ebx \n\t"
"movq packedYOffset, %%mm2 \n\t"
"movq packedYScale, %%mm3 \n\t"
"pxor %%mm4, %%mm4 \n\t"
#define SCALED_CPY \
"movq (%0), %%mm0 \n\t"\
"movq (%0,%2), %%mm1 \n\t"\
"psubusb %%mm2, %%mm0 \n\t"\
"psubusb %%mm2, %%mm1 \n\t"\
"movq %%mm0, %%mm5 \n\t"\
"punpcklbw %%mm4, %%mm0 \n\t"\
"punpckhbw %%mm4, %%mm5 \n\t"\
"psllw $7, %%mm0 \n\t"\
"psllw $7, %%mm5 \n\t"\
"pmulhw %%mm3, %%mm0 \n\t"\
"pmulhw %%mm3, %%mm5 \n\t"\
"packuswb %%mm5, %%mm0 \n\t"\
"movq %%mm0, (%1) \n\t"\
"movq %%mm1, %%mm5 \n\t"\
"punpcklbw %%mm4, %%mm1 \n\t"\
"punpckhbw %%mm4, %%mm5 \n\t"\
"psllw $7, %%mm1 \n\t"\
"psllw $7, %%mm5 \n\t"\
"pmulhw %%mm3, %%mm1 \n\t"\
"pmulhw %%mm3, %%mm5 \n\t"\
"packuswb %%mm5, %%mm1 \n\t"\
"movq %%mm1, (%1, %3) \n\t"
/* the 1: label must stay outside SCALED_CPY, so each loop iteration copies 4 lines */
"1: \n\t"
SCALED_CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
SCALED_CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
"decl temp0 \n\t"
"jnz 1b \n\t"
"popl %1 \n\t"
"popl %0 \n\t"
: : "r" (src),
"r" (dst),
"r" (srcStride),
"r" (dstStride),
"m" (numLines>>2)
: "%eax", "%ebx"
);
#else
for(i=0; i<numLines; i++)
memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE);
#endif
}
else
{
#ifdef HAVE_MMX
asm volatile(
"movl %4, %%eax \n\t"
"movl %%eax, temp0\n\t"
"pushl %0 \n\t"
"pushl %1 \n\t"
"leal (%2,%2), %%eax \n\t"
"leal (%3,%3), %%ebx \n\t"
"movq packedYOffset, %%mm2 \n\t"
"movq packedYScale, %%mm3 \n\t"
#define SIMPLE_CPY \
"movq (%0), %%mm0 \n\t"\
"movq (%0,%2), %%mm1 \n\t"\
"movq %%mm0, (%1) \n\t"\
"movq %%mm1, (%1, %3) \n\t"
"1: \n\t"
SIMPLE_CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
SIMPLE_CPY
"addl %%eax, %0 \n\t"
"addl %%ebx, %1 \n\t"
"decl temp0 \n\t"
"jnz 1b \n\t"
"popl %1 \n\t"
"popl %0 \n\t"
: : "r" (src),
"r" (dst),
"r" (srcStride),
"r" (dstStride),
"m" (numLines>>2)
: "%eax", "%ebx"
);
#else
for(i=0; i<numLines; i++)
memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE);
#endif
}
}
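/* Illustrative reference (an addition): the per-pixel math of SCALED_CPY
   above. scale512 stands for packedYScale, i.e. scale*512, assumed to fit
   in a signed 16-bit word as pmulhw requires; (v<<7)*scale512>>16 equals
   v*scale512/512, i.e. v*scale. */
static inline uint8_t levelFixRef(uint8_t v, int offset, int scale512)
{
	int t= v - offset;
	if(t < 0) t= 0;                    /* psubusb saturates at 0 */
	t= ((t<<7) * scale512)>>16;        /* psllw $7 + pmulhw */
	return t > 255 ? 255 : (uint8_t)t; /* packuswb saturates at 255 */
}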
/**
 * Filters array of bytes (Y or U or V values)
 */
static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
QP_STORE_T QPs[], int QPStride, int isColor, int mode)
{
int x,y;
/* we need 64 bits here, otherwise we'll have a problem
after watching a black picture for 5 hours */
static uint64_t *yHistogram= NULL;
int black=0, white=255; // blackest black and whitest white in the picture
/* Temporary buffers for handling the last row(s) */
static uint8_t *tempDst= NULL;
static uint8_t *tempSrc= NULL;
/* Temporary buffers for handling the last block */
static uint8_t *tempDstBlock= NULL;
static uint8_t *tempSrcBlock= NULL;
uint8_t *dstBlockPtrBackup;
uint8_t *srcBlockPtrBackup;
#ifdef TIMING
long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
sumTime= rdtsc();
#endif
if(tempDst==NULL)
{
tempDst= (uint8_t*)memalign(8, 1024*24);
tempSrc= (uint8_t*)memalign(8, 1024*24);
tempDstBlock= (uint8_t*)memalign(8, 1024*24);
tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
}
if(!yHistogram)
{
int i;
yHistogram= (uint64_t*)malloc(8*256);
for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
if(mode & FULL_Y_RANGE)
{
maxAllowedY=255;
minAllowedY=0;
}
}
if(!isColor)
{
uint64_t sum= 0;
int i;
static int framenum= -1;
uint64_t maxClipped;
uint64_t clipped;
double scale;
framenum++;
if(framenum == 1) yHistogram[0]= width*height/64*15/256;
for(i=0; i<256; i++)
{
sum+= yHistogram[i];
// printf("%d ", yHistogram[i]);
}
// printf("\n\n");
/* we always get a completely black picture first */
maxClipped= (uint64_t)(sum * maxClippedThreshold);
clipped= sum;
for(black=255; black>0; black--)
{
if(clipped < maxClipped) break;
clipped-= yHistogram[black];
}
clipped= sum;
for(white=0; white<256; white++)
{
if(clipped < maxClipped) break;
clipped-= yHistogram[white];
}
// we can't handle negative corrections
packedYOffset= MAX(black - minAllowedY, 0);
packedYOffset|= packedYOffset<<32;
packedYOffset|= packedYOffset<<16;
packedYOffset|= packedYOffset<<8;
scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
packedYScale= (uint16_t)(scale*512.0 + 0.5);
packedYScale|= packedYScale<<32;
packedYScale|= packedYScale<<16;
}
else
{
packedYScale= 0x0100010001000100LL;
packedYOffset= 0;
}
/* copy first row of 8x8 blocks */
for(x=0; x<width; x+=BLOCK_SIZE)
blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
for(y=0; y<height; y+=BLOCK_SIZE)
{
//1% speedup if these are here instead of the inner loop
uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= &(dst[y*dstStride]);
/* can we mess with an 8x16 block from srcBlock/dstBlock downwards?
if not, use a temporary buffer */
if(y+15 >= height)
{
/* copy lines 5 to 12 of src; these will be copied to dst later
with blockCopy */
memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5,
srcStride*MAX(height-y-5, 0) );
/* duplicate the last line to fill the void up to line 12 */
if(y+12 >= height)
{
int i;
for(i=height-y; i<=12; i++)
memcpy(tempSrc + srcStride*i,
src + srcStride*(height-1), srcStride);
}
/* copy up to 5 lines of dst */
memcpy(tempDst, dstBlock, dstStride*MIN(height-y, 5) );
dstBlock= tempDst;
srcBlock= tempSrc;
}
// From this point on it is guaranteed that we can read and write 16 lines downward
// finish 1 block before the next, otherwise we might have a problem
// with the L1 cache of the P4 ... or process only a few blocks at a time or something
for(x=0; x<width; x+=BLOCK_SIZE)
{
const int stride= dstStride;
int QP= isColor ?
QPs[(y>>3)*QPStride + (x>>3)]:
QPs[(y>>4)*QPStride + (x>>4)];
if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
#ifdef HAVE_MMX
asm volatile(
"movd %0, %%mm7 \n\t"
"packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
"movq %%mm7, pQPb \n\t"
: : "r" (QP)
);
#endif
#ifdef MORE_TIMING
T0= rdtsc();
#endif
#ifdef HAVE_MMX2
prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on a 3DNow! chip, or if it's faster without the prefetch, or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
*/
#endif
if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++;
#ifdef PP_FUNNY_STRIDE
// can we mess with an 8x16 block? if not, use a temp buffer (yes, again)
if(x+7 >= width)
{
int i;
dstBlockPtrBackup= dstBlock;
srcBlockPtrBackup= srcBlock;
for(i=0;i<BLOCK_SIZE*2; i++)
{
memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
}
dstBlock= tempDstBlock;
srcBlock= tempSrcBlock;
}
#endif
blockCopy(dstBlock + dstStride*5, dstStride,
srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX);
if(mode & LINEAR_IPOL_DEINT_FILTER)
deInterlaceInterpolateLinear(dstBlock, dstStride);
else if(mode & LINEAR_BLEND_DEINT_FILTER)
deInterlaceBlendLinear(dstBlock, dstStride);
else if(mode & MEDIAN_DEINT_FILTER)
deInterlaceMedian(dstBlock, dstStride);
else if(mode & CUBIC_IPOL_DEINT_FILTER)
deInterlaceInterpolateCubic(dstBlock, dstStride);
/* else if(mode & CUBIC_BLEND_DEINT_FILTER)
deInterlaceBlendCubic(dstBlock, dstStride);
*/
/* only deblock if we have 2 blocks */
if(y + 8 < height)
{
#ifdef MORE_TIMING
T1= rdtsc();
memcpyTime+= T1-T0;
T0=T1;
#endif
if(mode & V_RK1_FILTER)
vertRK1Filter(dstBlock, stride, QP);
else if(mode & V_X1_FILTER)
vertX1Filter(dstBlock, stride, QP);
else if(mode & V_DEBLOCK)
{
if( isVertDC(dstBlock, stride))
{
if(isVertMinMaxOk(dstBlock, stride, QP))
doVertLowPass(dstBlock, stride, QP);
}
else
doVertDefFilter(dstBlock, stride, QP);
}
#ifdef MORE_TIMING
T1= rdtsc();
vertTime+= T1-T0;
T0=T1;
#endif
}
/* check if there is a previous block to deblock together with dstBlock */
if(x - 8 >= 0)
{
#ifdef MORE_TIMING
T0= rdtsc();
#endif
if(mode & H_X1_FILTER)
horizX1Filter(dstBlock-4, stride, QP);
else if(mode & H_DEBLOCK)
{
if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
{
if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
}
else
doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
}
#ifdef MORE_TIMING
T1= rdtsc();
horizTime+= T1-T0;
T0=T1;
#endif
dering(dstBlock - 9 - stride, stride, QP);
}
else if(y!=0)
dering(dstBlock - stride*9 + width-9, stride, QP);
//FIXME dering filter will not be applied to the last block (bottom right)
#ifdef PP_FUNNY_STRIDE
/* did we use a tmp-block buffer */
if(x+7 >= width)
{
int i;
dstBlock= dstBlockPtrBackup;
srcBlock= srcBlockPtrBackup;
for(i=0;i<BLOCK_SIZE*2; i++)
{
memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
}
}
#endif
dstBlock+=8;
srcBlock+=8;
}
/* did we use a tmp buffer */
if(y+15 >= height)
{
uint8_t *dstBlock= &(dst[y*dstStride]);
memcpy(dstBlock, tempDst, dstStride*(height-y) );
}
}
#ifdef HAVE_3DNOW
asm volatile("femms");
#elif defined (HAVE_MMX)
asm volatile("emms");
#endif
#ifdef TIMING
// FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
sumTime= rdtsc() - sumTime;
if(!isColor)
printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
(int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
(int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
, black, white);
#endif
}