You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2278 lines
63KB

  1. /*
  2. Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
  3. This program is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program; if not, write to the Free Software
  13. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  14. */
  15. /*
  16. C MMX MMX2 3DNow*
  17. isVertDC Ec Ec
  18. isVertMinMaxOk Ec Ec
  19. doVertLowPass E e e*
  20. doVertDefFilter Ec Ec Ec
  21. isHorizDC Ec Ec
  22. isHorizMinMaxOk a
  23. doHorizLowPass E a a*
  24. doHorizDefFilter E ac ac
  25. deRing
  26. Vertical RKAlgo1 E a a*
  27. Vertical X1 a E E*
  28. Horizontal X1 a E E*
  29. * i dont have a 3dnow CPU -> its untested
  30. E = Exact implementation
  31. e = almost exact implementation
  32. a = alternative / approximate impl
  33. c = checked against the other implementations (-vo md5)
  34. */
  35. /*
  36. TODO:
  37. verify that everything works as it should (how?)
  38. reduce the time wasted on the mem transfer
  39. implement dering
  40. implement everything in C at least (done at the moment but ...)
  41. unroll stuff if instructions depend too much on the prior one
  42. we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
  43. move YScale thing to the end instead of fixing QP
  44. write a faster and higher quality deblocking filter :)
  45. do something about the speed of the horizontal filters
  46. make the mainloop more flexible (variable number of blocks at once
  47. (the if/else stuff per block is slowing things down)
  48. compare the quality & speed of all filters
  49. implement a few simple deinterlacing filters
  50. split this huge file
  51. ...
  52. Notes:
  53. */
  54. /*
  55. Changelog: use the CVS log
  56. 0.1.3
  57. bugfixes: last 3 lines not brightness/contrast corrected
  58. brightness statistics messed up with initial black pic
  59. changed initial values of the brightness statistics
  60. C++ -> C conversion
  61. QP range question solved (very likely 1<=QP<=32 according to arpi)
  62. new experimental vertical deblocking filter
  63. RK filter has 3dNow support now (untested)
  64. 0.1.2
  65. fixed a bug in the horizontal default filter
  66. 3dnow version of the Horizontal & Vertical Lowpass filters
  67. mmx version of the Horizontal Default filter
  68. mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar
  69. added mode flags & quality2mode function
  70. 0.1.1
  71. */
  72. #include <inttypes.h>
  73. #include <stdio.h>
  74. #include <stdlib.h>
  75. #include "../config.h"
  76. //#undef HAVE_MMX2
  77. //#define HAVE_3DNOW
  78. //#undef HAVE_MMX
  79. #include "postprocess.h"
/* 64-bit packed constants for the MMX/MMX2/3DNow asm blocks below; the asm
 * references them by name (e.g. "movq b7E, %%mm7") straight from memory. */
static uint64_t packedYOffset= 0x0000000000000000LL;	// per-byte luma offset (brightness correction)
static uint64_t packedYScale= 0x0100010001000100LL;	// per-word luma scale, 8.8 fixed point (== 1.0)
static uint64_t w05= 0x0005000500050005LL;		// word constant 5 in every lane
static uint64_t w20= 0x0020002000200020LL;		// word constant 0x20 in every lane
static uint64_t w1400= 0x1400140014001400LL;		// word constant 0x1400 in every lane
/* bmXXXXXXXX: byte masks; each digit of the name says whether that byte
 * (most significant first) is 0xFF (1) or 0x00 (0). */
static uint64_t bm00000001= 0x00000000000000FFLL;
static uint64_t bm00010000= 0x000000FF00000000LL;
static uint64_t bm00001000= 0x00000000FF000000LL;
static uint64_t bm10000000= 0xFF00000000000000LL;
static uint64_t bm10000001= 0xFF000000000000FFLL;
static uint64_t bm11000011= 0xFFFF00000000FFFFLL;
static uint64_t bm00000011= 0x000000000000FFFFLL;
static uint64_t bm11111110= 0xFFFFFFFFFFFFFF00LL;
static uint64_t bm11000000= 0xFFFF000000000000LL;
static uint64_t bm00011000= 0x000000FFFF000000LL;
static uint64_t bm00110011= 0x0000FFFF0000FFFFLL;
static uint64_t bm11001100= 0xFFFF0000FFFF0000LL;
/* bXX: the byte value 0xXX replicated into all 8 lanes. */
static uint64_t b00= 0x0000000000000000LL;
static uint64_t b01= 0x0101010101010101LL;
static uint64_t b02= 0x0202020202020202LL;
static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL;
static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL;
static uint64_t b20= 0x2020202020202020LL;
static uint64_t b80= 0x8080808080808080LL;
static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL;
static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL;
static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL;
/* Scratch slots for the asm code. */
static uint64_t temp0=0;
static uint64_t temp1=0;
static uint64_t temp2=0;
static uint64_t temp3=0;
static uint64_t temp4=0;
static uint64_t temp5=0;
static uint64_t pQPb=0;			// current QP replicated into all 8 byte lanes
static uint8_t tempBlock[16*16];	// scratch block buffer for the filters
/* How many of the 8x7 vertical/horizontal neighbour pairs must be (nearly)
 * equal before a block counts as flat (see isVertDC). */
int hFlatnessThreshold= 56 - 16;
int vFlatnessThreshold= 56 - 16;
//amount of "black" u r willing to loose to get a brightness corrected picture
double maxClippedThreshold= 0.01;	// fraction of pixels allowed to clip during brightness correction
int maxAllowedY=255;
//FIXME can never make a movie's black brighter (anyone needs that?)
int minAllowedY=0;
#ifdef TIMEING	/* NOTE(review): probably meant "TIMING" -- nothing visible defines either spelling */
/* Read the x86 time-stamp counter for crude profiling.
 * "=A" maps the 64-bit result from the edx:eax pair (i386 only). */
static inline long long rdtsc()
{
	long long l;
	asm volatile( "rdtsc\n\t"
		: "=A" (l)
	);
//	printf("%d\n", int(l/1000));
	return l;
}
#endif
  133. #ifdef HAVE_MMX2
/* Non-temporal prefetch: hint the CPU to fetch *p while minimizing cache pollution. */
static inline void prefetchnta(void *p)
{
	asm volatile( "prefetchnta (%0)\n\t"
		: : "r" (p)
	);
}
/* Prefetch *p into all cache levels (temporal data, t0 hint). */
static inline void prefetcht0(void *p)
{
	asm volatile( "prefetcht0 (%0)\n\t"
		: : "r" (p)
	);
}
/* Prefetch *p into the second cache level and higher (t1 hint). */
static inline void prefetcht1(void *p)
{
	asm volatile( "prefetcht1 (%0)\n\t"
		: : "r" (p)
	);
}
/* Prefetch *p into the outer cache level(s) only (t2 hint). */
static inline void prefetcht2(void *p)
{
	asm volatile( "prefetcht2 (%0)\n\t"
		: : "r" (p)
	);
}
  158. #endif
  159. //FIXME? |255-0| = 1 (shouldnt be a problem ...)
/**
 * Check if the middle 8x8 Block in the given 8x10 block is flat:
 * over the 7 vertically adjacent line pairs of the 8x8 block, count how many
 * of the 8 columns differ by at most 1, and report 1 when that count exceeds
 * vFlatnessThreshold.
 * @param src    top line of the 8x10 block (the 8x8 block begins one line down)
 * @param stride distance in bytes between two lines
 * @return 1 if the block is considered flat (DC), 0 otherwise
 */
static inline int isVertDC(uint8_t src[], int stride){
//	return true;
	int numEq= 0;
	int y;
	src+= stride; // src points to begin of the 8x8 Block
#ifdef HAVE_MMX
	/* Per line pair: (a-b)+0x7E compared with pcmpgtb against 0x7C yields 0xFF
	 * exactly when -1 <= a-b <= 1; those -1 bytes are accumulated with paddb,
	 * then folded horizontally so the low byte of mm0 holds -count. */
	asm volatile(
//		"int $3 \n\t"
		"pushl %1\n\t"
		"movq b7E, %%mm7 \n\t" // mm7 = 0x7E,... (signed-compare bias)
		"movq b7C, %%mm6 \n\t" // mm6 = 0x7C,... (compare threshold)
		"movq (%1), %%mm0 \n\t"
		"addl %2, %1 \n\t"
		"movq (%1), %%mm1 \n\t"
		"psubb %%mm1, %%mm0 \n\t" // mm0 = difference of lines 0 and 1
		"paddb %%mm7, %%mm0 \n\t"
		"pcmpgtb %%mm6, %%mm0 \n\t"
		"addl %2, %1 \n\t"
		"movq (%1), %%mm2 \n\t"
		"psubb %%mm2, %%mm1 \n\t"
		"paddb %%mm7, %%mm1 \n\t"
		"pcmpgtb %%mm6, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"addl %2, %1 \n\t"
		"movq (%1), %%mm1 \n\t"
		"psubb %%mm1, %%mm2 \n\t"
		"paddb %%mm7, %%mm2 \n\t"
		"pcmpgtb %%mm6, %%mm2 \n\t"
		"paddb %%mm2, %%mm0 \n\t"
		"addl %2, %1 \n\t"
		"movq (%1), %%mm2 \n\t"
		"psubb %%mm2, %%mm1 \n\t"
		"paddb %%mm7, %%mm1 \n\t"
		"pcmpgtb %%mm6, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"addl %2, %1 \n\t"
		"movq (%1), %%mm1 \n\t"
		"psubb %%mm1, %%mm2 \n\t"
		"paddb %%mm7, %%mm2 \n\t"
		"pcmpgtb %%mm6, %%mm2 \n\t"
		"paddb %%mm2, %%mm0 \n\t"
		"addl %2, %1 \n\t"
		"movq (%1), %%mm2 \n\t"
		"psubb %%mm2, %%mm1 \n\t"
		"paddb %%mm7, %%mm1 \n\t"
		"pcmpgtb %%mm6, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"addl %2, %1 \n\t"
		"movq (%1), %%mm1 \n\t"
		"psubb %%mm1, %%mm2 \n\t"
		"paddb %%mm7, %%mm2 \n\t"
		"pcmpgtb %%mm6, %%mm2 \n\t"
		"paddb %%mm2, %%mm0 \n\t"
		"		\n\t"
		/* horizontal fold of the 8 per-column counters into the low byte */
		"movq %%mm0, %%mm1 \n\t"
		"psrlw $8, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlq $16, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlq $32, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"popl %1\n\t"
		"movd %%mm0, %0 \n\t"
		: "=r" (numEq)
		: "r" (src), "r" (stride)
		);
//	printf("%d\n", numEq);
	numEq= (256 - (numEq & 0xFF)) &0xFF; // negate the -count in the low byte
//	int asmEq= numEq;
//	numEq=0;
//	uint8_t *temp= src;
#else
	/* (a - b + 1) & 0xFFFF < 3 is a branch-free test for |a-b| <= 1:
	 * the wraparound makes negative differences large. */
	for(y=0; y<BLOCK_SIZE-1; y++)
	{
		if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
		if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
		src+= stride;
	}
#endif
/*	if(abs(numEq - asmEq) > 0)
	{
		printf("\nasm:%d  c:%d\n", asmEq, numEq);
		for(int y=0; y<8; y++)
		{
			for(int x=0; x<8; x++)
			{
				printf("%d ", temp[x + y*stride]);
			}
			printf("\n");
		}
	}
*/
//	for(int i=0; i<numEq/8; i++) src[i]=255;
	return (numEq > vFlatnessThreshold) ? 1 : 0;
}
/**
 * Check that the block has no big vertical jump: returns 1 iff
 * |src[x + stride] - src[x + 8*stride]| <= 2*QP for every one of the 8 columns
 * (i.e. the first and last lines of the 8x8 block are close enough to filter).
 */
static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
{
#ifdef HAVE_MMX
	int isOk;
	asm volatile(
//		"int $3 \n\t"
		"movq (%1, %2), %%mm0 \n\t"
		"movq (%1, %2, 8), %%mm1 \n\t"
		"movq %%mm0, %%mm2 \n\t"
		"psubusb %%mm1, %%mm0 \n\t"
		"psubusb %%mm2, %%mm1 \n\t"
		"por %%mm1, %%mm0 \n\t" // ABS Diff
		"movq pQPb, %%mm7 \n\t" // QP,..., QP
		"paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
		"psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
		/* Collapse the 8 per-byte results: after pcmpeqd each dword is all-1s
		 * iff its 4 bytes passed; the shift+second pcmpeqd make the low dword
		 * non-zero only when BOTH dwords were all-1s, so the test is exact. */
		"pcmpeqd b00, %%mm0 \n\t"
		"psrlq $16, %%mm0 \n\t"
		"pcmpeqd bFF, %%mm0 \n\t"
//		"movd %%mm0, (%1, %2, 4)\n\t"
		"movd %%mm0, %0 \n\t"
		: "=r" (isOk)
		: "r" (src), "r" (stride)
		);
	return isOk ? 1 : 0;
#else
	int isOk2= 1;
	int x;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
	}
/*	if(isOk && !isOk2 || !isOk && isOk2)
	{
		printf("\nasm:%d  c:%d QP:%d\n", isOk, isOk2, QP);
		for(int y=0; y<9; y++)
		{
			for(int x=0; x<8; x++)
			{
				printf("%d ", src[x + y*stride]);
			}
			printf("\n");
		}
	} */
	return isOk2;
#endif
}
/**
 * Do a vertical low pass filter on the 8x10 block (only write to the 8x8 block
 * in the middle) using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16.
 * The outermost taps (line 0 and line 9) are clamped: if they differ from
 * their inner neighbour by >= QP the neighbour's value is used instead, so the
 * block edge itself is smoothed but real edges survive.
 */
static inline void doVertLowPass(uint8_t *src, int stride, int QP)
{
//	QP= 64;
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
//#ifdef HAVE_MMX2
	/* The MMX2/3DNow path builds the 16-denominator taps out of cascaded
	 * PAVGB (pavgb/pavgusb) byte averages; the running "/2 /4 ... /16"
	 * comments track each register's accumulated weight. */
	asm volatile(	//"movv %0 %1 %2\n\t"
		"pushl %0 \n\t"
		"movq pQPb, %%mm0 \n\t"  // QP,..., QP
//		"movq bFF , %%mm0 \n\t"  // QP,..., QP
		"movq (%0), %%mm6 \n\t"
		"movq (%0, %1), %%mm5 \n\t"
		"movq %%mm5, %%mm1 \n\t"
		"movq %%mm6, %%mm2 \n\t"
		"psubusb %%mm6, %%mm5 \n\t"
		"psubusb %%mm1, %%mm2 \n\t"
		"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
		"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
		"pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
		"pand %%mm2, %%mm6 \n\t"
		"pandn %%mm1, %%mm2 \n\t"
		"por %%mm2, %%mm6 \n\t"// First Line to Filter
		"movq (%0, %1, 8), %%mm5 \n\t"
		"leal (%0, %1, 4), %%eax \n\t"
		"leal (%0, %1, 8), %%ebx \n\t"
		"subl %1, %%ebx \n\t"	// ebx = line 7
		"addl %1, %0 \n\t" // %0 points to line 1 not 0
		"movq (%0, %1, 8), %%mm7 \n\t"
		"movq %%mm5, %%mm1 \n\t"
		"movq %%mm7, %%mm2 \n\t"
		"psubusb %%mm7, %%mm5 \n\t"
		"psubusb %%mm1, %%mm2 \n\t"
		"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
		"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
		"pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
		"pand %%mm2, %%mm7 \n\t"
		"pandn %%mm1, %%mm2 \n\t"
		"por %%mm2, %%mm7 \n\t" // First Line to Filter
		//	1	2	3	4	5	6	7	8
		//	%0	%0+%1	%0+2%1	eax	%0+4%1	eax+2%1	ebx	eax+4%1
		// 6 4 2 2 1 1
		// 6 4 4 2
		// 6 8 2
/*
		"movq %%mm6, %%mm2 \n\t" //1
		"movq %%mm6, %%mm3 \n\t" //1
		"paddusb b02, %%mm3 \n\t"
		"psrlw $2, %%mm3 \n\t" //1	/4
		"pand b3F, %%mm3 \n\t"
		"psubb %%mm3, %%mm2 \n\t"
		"movq (%0, %1), %%mm0 \n\t" //  1
		"movq %%mm0, %%mm1 \n\t" //  1
		"paddusb b02, %%mm0 \n\t"
		"psrlw $2, %%mm0 \n\t" //  1	/4
		"pand b3F, %%mm0 \n\t"
		"paddusb %%mm2, %%mm0 \n\t" //3 1	/4
*/
		"movq (%0, %1), %%mm0 \n\t" //  1
		"movq %%mm0, %%mm1 \n\t" //  1
		PAVGB(%%mm6, %%mm0) //1 1	/2
		PAVGB(%%mm6, %%mm0) //3 1	/4
		"movq (%0, %1, 4), %%mm2 \n\t" //     1
		"movq %%mm2, %%mm5 \n\t" //     1
		PAVGB((%%eax), %%mm2) //    11	/2
		PAVGB((%0, %1, 2), %%mm2) //   211	/4
		"movq %%mm2, %%mm3 \n\t" //   211	/4
		"movq (%0), %%mm4 \n\t" // 1
		PAVGB(%%mm4, %%mm3) // 4 211	/8
		PAVGB(%%mm0, %%mm3) //642211	/16
		"movq %%mm3, (%0) \n\t" // X
		// mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
		"movq %%mm1, %%mm0 \n\t" //  1
		PAVGB(%%mm6, %%mm0) //1 1	/2
		"movq %%mm4, %%mm3 \n\t" // 1
		PAVGB((%0,%1,2), %%mm3) // 1 1	/2
		PAVGB((%%eax,%1,2), %%mm5) //     11	/2
		PAVGB((%%eax), %%mm5) //    211 /4
		PAVGB(%%mm5, %%mm3) // 2 2211 /8
		PAVGB(%%mm0, %%mm3) //4242211 /16
		"movq %%mm3, (%0,%1) \n\t" //  X
		// mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
		PAVGB(%%mm4, %%mm6) //11	/2
		"movq (%%ebx), %%mm0 \n\t" //       1
		PAVGB((%%eax, %1, 2), %%mm0) //      11/2
		"movq %%mm0, %%mm3 \n\t" //      11/2
		PAVGB(%%mm1, %%mm0) //  2   11/4
		PAVGB(%%mm6, %%mm0) //222   11/8
		PAVGB(%%mm2, %%mm0) //22242211/16
		"movq (%0, %1, 2), %%mm2 \n\t" //   1
		"movq %%mm0, (%0, %1, 2) \n\t" //   X
		// mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
		"movq (%%eax, %1, 4), %%mm0 \n\t" //        1
		PAVGB((%%ebx), %%mm0) //       11	/2
		PAVGB(%%mm0, %%mm6) //11     11	/4
		PAVGB(%%mm1, %%mm4) // 11		/2
		PAVGB(%%mm2, %%mm1) //  11		/2
		PAVGB(%%mm1, %%mm6) //1122   11	/8
		PAVGB(%%mm5, %%mm6) //112242211	/16
		"movq (%%eax), %%mm5 \n\t" //    1
		"movq %%mm6, (%%eax) \n\t" //    X
		// mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
		"movq (%%eax, %1, 4), %%mm6 \n\t" //        1
		PAVGB(%%mm7, %%mm6) //        11	/2
		PAVGB(%%mm4, %%mm6) // 11     11	/4
		PAVGB(%%mm3, %%mm6) // 11   2211	/8
		PAVGB(%%mm5, %%mm2) //   11		/2
		"movq (%0, %1, 4), %%mm4 \n\t" //     1
		PAVGB(%%mm4, %%mm2) //   112		/4
		PAVGB(%%mm2, %%mm6) //  112242211	/16
		"movq %%mm6, (%0, %1, 4) \n\t" //     X
		// mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
		PAVGB(%%mm7, %%mm1) //  11     2	/4
		PAVGB(%%mm4, %%mm5) //    11		/2
		PAVGB(%%mm5, %%mm0) //    11 11	/4
		"movq (%%eax, %1, 2), %%mm6 \n\t" //      1
		PAVGB(%%mm6, %%mm1) //  11  4  2	/8
		PAVGB(%%mm0, %%mm1) //  11224222	/16
//		"pxor %%mm1, %%mm1 \n\t"
		"movq %%mm1, (%%eax, %1, 2) \n\t" //      X
		// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
		PAVGB((%%ebx), %%mm2) //   112 4	/8
		"movq (%%eax, %1, 4), %%mm0 \n\t" //        1
		PAVGB(%%mm0, %%mm6) //      1 1	/2
		PAVGB(%%mm7, %%mm6) //      1 12	/4
		PAVGB(%%mm2, %%mm6) //   1122424	/4
//		"pxor %%mm6, %%mm6 \n\t"
		"movq %%mm6, (%%ebx) \n\t" //       X
		// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
		PAVGB(%%mm7, %%mm5) //    11   2	/4
		PAVGB(%%mm7, %%mm5) //    11   6	/8
		PAVGB(%%mm3, %%mm0) //      112	/4
		PAVGB(%%mm0, %%mm5) //    112246	/16
//		"pxor %%mm5, %%mm5 \n\t"
//		"movq pQPb, %%mm5 \n\t"
		"movq %%mm5, (%%eax, %1, 4) \n\t" //        X
		"popl %0\n\t"
		:
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
	const int l9= stride + l8;
	int x;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		/* clamp the outer taps: outside a real edge, fall back to the inner line */
		const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
		const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
		int sums[9];	// sums[i] = pair sum of two adjacent taps
		sums[0] = first + src[l1];
		sums[1] = src[l1] + src[l2];
		sums[2] = src[l2] + src[l3];
		sums[3] = src[l3] + src[l4];
		sums[4] = src[l4] + src[l5];
		sums[5] = src[l5] + src[l6];
		sums[6] = src[l6] + src[l7];
		sums[7] = src[l7] + src[l8];
		sums[8] = src[l8] + last;
		/* NOTE: "a + b<<1" below is intentional -- '+' binds tighter than '<<',
		 * so it means (a+b)*2; each line's weights then total 16 (the >>4). */
		src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
		src[l2]= ((src[l2]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
		src[l3]= ((src[l3]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
		src[l4]= ((src[l4]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
		src[l5]= ((src[l5]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
		src[l6]= ((src[l6]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
		src[l7]= ((last + src[l7]<<2) + (src[l8] + sums[5]<<1) + sums[3] + 8)>>4;
		src[l8]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
		src++;
	}
#endif
}
/**
 * Experimental implementation of the filter (Algorithm 1) described in a paper
 * from Ramkishor & Karandikar.
 * values are correctly clipped (MMX2)
 * values are wraparound (C)
 * Conclusion: it's fast, but introduces ugly horizontal patterns if there is a
 * continuous gradient.
 *	0 8 16 24
 *	x = 8
 *	x/2 = 4
 *	x/8 = 1
 *	1 12 12 23
 * Smooths the step between lines 4 and 5 of the block (spreading v = l5-l4
 * over lines 3..6) when |l4-l5| < QP*1.25.
 */
static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// FIXME rounding
	/* The 0x80 (b80) bias converts between unsigned bytes and the signed
	 * arithmetic needed for the +-v/2 and +-v/8 corrections. */
	asm volatile(
		"pxor %%mm7, %%mm7 \n\t" // 0
		"movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"movq pQPb, %%mm0 \n\t" // QP,..., QP
		"movq %%mm0, %%mm1 \n\t" // QP,..., QP
		"paddusb b02, %%mm0 \n\t"
		"psrlw $2, %%mm0 \n\t"
		"pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
		"paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
		"movq (%0, %1, 4), %%mm2 \n\t" // line 4
		"movq (%%ebx), %%mm3 \n\t" // line 5
		"movq %%mm2, %%mm4 \n\t" // line 4
		"pcmpeqb %%mm5, %%mm5 \n\t" // -1
		"pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
		PAVGB(%%mm3, %%mm5)
		"paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
		"psubusb %%mm3, %%mm4 \n\t"
		"psubusb %%mm2, %%mm3 \n\t"
		"por %%mm3, %%mm4 \n\t" // |l4 - l5|
		"psubusb %%mm0, %%mm4 \n\t"
		"pcmpeqb %%mm7, %%mm4 \n\t"	// |l4-l5| < QP*1.25 -> FF mask
		"pand %%mm4, %%mm5 \n\t" // d/2
//		"paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
		"paddb %%mm5, %%mm2 \n\t"
//		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%0,%1, 4) \n\t"
		"movq (%%ebx), %%mm2 \n\t"
//		"paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
		"psubb %%mm5, %%mm2 \n\t"
//		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%%ebx) \n\t"
		"paddb %%mm6, %%mm5 \n\t"
		"psrlw $2, %%mm5 \n\t"
		"pand b3F, %%mm5 \n\t"
		"psubb b20, %%mm5 \n\t" // (l5-l4)/8
		"movq (%%eax, %1, 2), %%mm2 \n\t"
		"paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
		"paddsb %%mm5, %%mm2 \n\t"
		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%%eax, %1, 2) \n\t"
		"movq (%%ebx, %1), %%mm2 \n\t"
		"paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
		"psubsb %%mm5, %%mm2 \n\t"
		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%%ebx, %1) \n\t"
		:
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
	const int l9= stride + l8;
	int x;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		if(ABS(src[l4]-src[l5]) < QP + QP/4)
		{
			int v = (src[l5] - src[l4]);	// step height; spread over 4 lines
			src[l3] +=v/8;
			src[l4] +=v/2;
			src[l5] -=v/2;
			src[l6] -=v/8;
		}
		src++;
	}
#endif
}
/**
 * Experimental Filter 1
 * will not damage linear gradients
 * Flat blocks should look like they were passed through the
 * (1,1,2,2,4,2,2,1,1) 9-Tap filter
 * can only smooth blocks at the expected locations (it can't smooth them if
 * they did move)
 * MMX2 version does correct clipping, the C version doesn't.
 * Computes d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2); when d < QP the step
 * between lines 4 and 5 is softened by +-3d/8, +-d/4, +-d/8 on lines 2..7.
 */
static inline void vertX1Filter(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	/* mm2 holds the sign mask of (l4-l5); xor with it before/after the
	 * unsigned saturated add/sub implements a signed correction. */
	asm volatile(
		"pxor %%mm7, %%mm7 \n\t" // 0
//		"movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
		"movq (%0, %1, 4), %%mm1 \n\t" // line 4
		"movq %%mm1, %%mm2 \n\t" // line 4
		"psubusb %%mm0, %%mm1 \n\t"
		"psubusb %%mm2, %%mm0 \n\t"
		"por %%mm1, %%mm0 \n\t" // |l2 - l3|
		"movq (%%ebx), %%mm3 \n\t" // line 5
		"movq (%%ebx, %1), %%mm4 \n\t" // line 6
		"movq %%mm3, %%mm5 \n\t" // line 5
		"psubusb %%mm4, %%mm3 \n\t"
		"psubusb %%mm5, %%mm4 \n\t"
		"por %%mm4, %%mm3 \n\t" // |l5 - l6|
		PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
		"movq %%mm2, %%mm1 \n\t" // line 4
		"psubusb %%mm5, %%mm2 \n\t"
		"movq %%mm2, %%mm4 \n\t"
		"pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
		"psubusb %%mm1, %%mm5 \n\t"
		"por %%mm5, %%mm4 \n\t" // |l4 - l5|
		"psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
		"movq %%mm4, %%mm3 \n\t" // d
		"psubusb pQPb, %%mm4 \n\t"
		"pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
		"psubusb b01, %%mm3 \n\t"
		"pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
		PAVGB(%%mm7, %%mm3) // d/2
		"movq %%mm3, %%mm1 \n\t" // d/2
		PAVGB(%%mm7, %%mm3) // d/4
		PAVGB(%%mm1, %%mm3) // 3*d/8
		"movq (%0, %1, 4), %%mm0 \n\t" // line 4
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
		"psubusb %%mm3, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%0, %1, 4) \n\t" // line 4
		"movq (%%ebx), %%mm0 \n\t" // line 5
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
		"paddusb %%mm3, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%%ebx) \n\t" // line 5
		PAVGB(%%mm7, %%mm1) // d/4
		"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
		"psubusb %%mm1, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
		"movq (%%ebx, %1), %%mm0 \n\t" // line 6
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
		"paddusb %%mm1, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%%ebx, %1) \n\t" // line 6
		PAVGB(%%mm7, %%mm1) // d/8
		"movq (%%eax, %1), %%mm0 \n\t" // line 2
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
		"psubusb %%mm1, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%%eax, %1) \n\t" // line 2
		"movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
		"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
		"paddusb %%mm1, %%mm0 \n\t"
		"pxor %%mm2, %%mm0 \n\t"
		"movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
		:
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
	const int l9= stride + l8;
	int x;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		int a= src[l3] - src[l4];
		int b= src[l4] - src[l5];
		int c= src[l5] - src[l6];
		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
		if(d < QP)
		{
			int v = d * SIGN(-b);	// signed correction toward the smoother side
			src[l2] +=v/8;
			src[l3] +=v/4;
			src[l4] +=3*v/8;
			src[l5] -=3*v/8;
			src[l6] -=v/4;
			src[l7] -=v/8;
		}
		src++;
	}
/*
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
	const int l9= stride + l8;
	for(int x=0; x<BLOCK_SIZE; x++)
	{
		int v2= src[l2];
		int v3= src[l3];
		int v4= src[l4];
		int v5= src[l5];
		int v6= src[l6];
		int v7= src[l7];
		if(ABS(v4-v5)<QP &&  ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
		{
			src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6         )/16;
			src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7  )/16;
			src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
			src[l6] = (       1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
		}
		src++;
	}
*/
#endif
}
  726. /**
  727. * Experimental Filter 1 (Horizontal)
  728. * will not damage linear gradients
  729. * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
  730. * can only smooth blocks at the expected locations (it cant smooth them if they did move)
  731. * MMX2 version does correct clipping C version doesnt
  732. * not identical with the vertical one
  733. */
  734. static inline void horizX1Filter(uint8_t *src, int stride, int QP)
  735. {
  736. int y;
  737. static uint64_t *lut= NULL;
  738. if(lut==NULL)
  739. {
  740. int i;
  741. lut= (uint64_t*)memalign(8, 256*8);
  742. for(i=0; i<256; i++)
  743. {
  744. int v= i < 128 ? 2*i : 2*(i-256);
  745. /*
  746. //Simulate 112242211 9-Tap filter
  747. uint64_t a= (v/16) & 0xFF;
  748. uint64_t b= (v/8) & 0xFF;
  749. uint64_t c= (v/4) & 0xFF;
  750. uint64_t d= (3*v/8) & 0xFF;
  751. */
  752. //Simulate piecewise linear interpolation
  753. uint64_t a= (v/16) & 0xFF;
  754. uint64_t b= (v*3/16) & 0xFF;
  755. uint64_t c= (v*5/16) & 0xFF;
  756. uint64_t d= (7*v/16) & 0xFF;
  757. uint64_t A= (0x100 - a)&0xFF;
  758. uint64_t B= (0x100 - b)&0xFF;
  759. uint64_t C= (0x100 - c)&0xFF;
  760. uint64_t D= (0x100 - c)&0xFF;
  761. lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
  762. (D<<24) | (C<<16) | (B<<8) | (A);
  763. //lut[i] = (v<<32) | (v<<24);
  764. }
  765. }
  766. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  767. asm volatile(
  768. "pxor %%mm7, %%mm7 \n\t" // 0
  769. // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
  770. "leal (%0, %1), %%eax \n\t"
  771. "leal (%%eax, %1, 4), %%ebx \n\t"
  772. "movq b80, %%mm6 \n\t"
  773. "movd pQPb, %%mm5 \n\t" // QP
  774. "movq %%mm5, %%mm4 \n\t"
  775. "paddusb %%mm5, %%mm5 \n\t" // 2QP
  776. "paddusb %%mm5, %%mm4 \n\t" // 3QP
  777. "pxor %%mm5, %%mm5 \n\t" // 0
  778. "psubb %%mm4, %%mm5 \n\t" // -3QP
  779. "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
  780. "psllq $24, %%mm5 \n\t"
  781. // 0 1 2 3 4 5 6 7 8 9
  782. // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
  783. #define HX1old(a) \
  784. "movd " #a ", %%mm0 \n\t"\
  785. "movd 4" #a ", %%mm1 \n\t"\
  786. "punpckldq %%mm1, %%mm0 \n\t"\
  787. "movq %%mm0, %%mm1 \n\t"\
  788. "movq %%mm0, %%mm2 \n\t"\
  789. "psrlq $8, %%mm1 \n\t"\
  790. "psubusb %%mm1, %%mm2 \n\t"\
  791. "psubusb %%mm0, %%mm1 \n\t"\
  792. "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
  793. "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
  794. "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
  795. PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
  796. "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
  797. "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
  798. "paddb %%mm5, %%mm1 \n\t"\
  799. "psubusb %%mm5, %%mm1 \n\t"\
  800. PAVGB(%%mm7, %%mm1)\
  801. "pxor %%mm2, %%mm1 \n\t"\
  802. "psubb %%mm2, %%mm1 \n\t"\
  803. "psrlq $24, %%mm1 \n\t"\
  804. "movd %%mm1, %%ecx \n\t"\
  805. "paddb %%mm6, %%mm0 \n\t"\
  806. "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
  807. "paddb %%mm6, %%mm0 \n\t"\
  808. "movq %%mm0, " #a " \n\t"\
  809. /*
  810. HX1old((%0))
  811. HX1old((%%eax))
  812. HX1old((%%eax, %1))
  813. HX1old((%%eax, %1, 2))
  814. HX1old((%0, %1, 4))
  815. HX1old((%%ebx))
  816. HX1old((%%ebx, %1))
  817. HX1old((%%ebx, %1, 2))
  818. */
  819. //FIXME add some comments, its unreadable ...
  820. #define HX1b(a, c, b, d) \
  821. "movd " #a ", %%mm0 \n\t"\
  822. "movd 4" #a ", %%mm1 \n\t"\
  823. "punpckldq %%mm1, %%mm0 \n\t"\
  824. "movd " #b ", %%mm4 \n\t"\
  825. "movq %%mm0, %%mm1 \n\t"\
  826. "movq %%mm0, %%mm2 \n\t"\
  827. "psrlq $8, %%mm1 \n\t"\
  828. "movd 4" #b ", %%mm3 \n\t"\
  829. "psubusb %%mm1, %%mm2 \n\t"\
  830. "psubusb %%mm0, %%mm1 \n\t"\
  831. "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
  832. "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
  833. "punpckldq %%mm3, %%mm4 \n\t"\
  834. "movq %%mm1, %%mm3 \n\t"\
  835. "psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
  836. PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
  837. "paddb %%mm6, %%mm0 \n\t"\
  838. "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
  839. "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
  840. "movq %%mm4, %%mm3 \n\t"\
  841. "paddb %%mm5, %%mm1 \n\t"\
  842. "psubusb %%mm5, %%mm1 \n\t"\
  843. "psrlq $8, %%mm3 \n\t"\
  844. PAVGB(%%mm7, %%mm1)\
  845. "pxor %%mm2, %%mm1 \n\t"\
  846. "psubb %%mm2, %%mm1 \n\t"\
  847. "movq %%mm4, %%mm2 \n\t"\
  848. "psrlq $24, %%mm1 \n\t"\
  849. "psubusb %%mm3, %%mm2 \n\t"\
  850. "movd %%mm1, %%ecx \n\t"\
  851. "psubusb %%mm4, %%mm3 \n\t"\
  852. "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
  853. "por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\
  854. "paddb %%mm6, %%mm0 \n\t"\
  855. "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
  856. "movq %%mm3, %%mm1 \n\t"\
  857. "psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\
  858. "movq %%mm0, " #a " \n\t"\
  859. PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
  860. "paddb %%mm6, %%mm4 \n\t"\
  861. "psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
  862. "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
  863. "paddb %%mm5, %%mm3 \n\t"\
  864. "psubusb %%mm5, %%mm3 \n\t"\
  865. PAVGB(%%mm7, %%mm3)\
  866. "pxor %%mm2, %%mm3 \n\t"\
  867. "psubb %%mm2, %%mm3 \n\t"\
  868. "psrlq $24, %%mm3 \n\t"\
  869. "movd " #c ", %%mm0 \n\t"\
  870. "movd 4" #c ", %%mm1 \n\t"\
  871. "punpckldq %%mm1, %%mm0 \n\t"\
  872. "paddb %%mm6, %%mm0 \n\t"\
  873. "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
  874. "paddb %%mm6, %%mm0 \n\t"\
  875. "movq %%mm0, " #c " \n\t"\
  876. "movd %%mm3, %%ecx \n\t"\
  877. "movd " #d ", %%mm0 \n\t"\
  878. "paddsb (%2, %%ecx, 8), %%mm4 \n\t"\
  879. "movd 4" #d ", %%mm1 \n\t"\
  880. "paddb %%mm6, %%mm4 \n\t"\
  881. "punpckldq %%mm1, %%mm0 \n\t"\
  882. "movq %%mm4, " #b " \n\t"\
  883. "paddb %%mm6, %%mm0 \n\t"\
  884. "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
  885. "paddb %%mm6, %%mm0 \n\t"\
  886. "movq %%mm0, " #d " \n\t"\
  887. HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
  888. HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
  889. :
  890. : "r" (src), "r" (stride), "r" (lut)
  891. : "%eax", "%ebx", "%ecx"
  892. );
  893. #else
  894. //FIXME (has little in common with the mmx2 version)
  895. for(y=0; y<BLOCK_SIZE; y++)
  896. {
  897. int a= src[1] - src[2];
  898. int b= src[3] - src[4];
  899. int c= src[5] - src[6];
  900. int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
  901. if(d < QP)
  902. {
  903. int v = d * SIGN(-b);
  904. src[1] +=v/8;
  905. src[2] +=v/4;
  906. src[3] +=3*v/8;
  907. src[4] -=3*v/8;
  908. src[5] -=v/4;
  909. src[6] -=v/8;
  910. }
  911. src+=stride;
  912. }
  913. #endif
  914. }
/**
 * Vertical default (de-blocking) filter for one 8x8 block.
 *
 * Operates on the boundary between lines 4 and 5 of the 10-line context
 * (src points one line above the block; the code immediately advances it
 * by one stride).  A correction is applied only when the "middle energy"
 * 5*(L5-L4) + 2*(L3-L6) is below 8*QP, and it is clipped so the boundary
 * step is never over-corrected (see the C reference path at the bottom).
 *
 * The MMX path processes all 8 columns at once: each line is unpacked to
 * 16-bit words, the low 4 columns ("L*") and high 4 columns ("H*") are
 * handled in parallel, and intermediates are spilled to the file-level
 * temp0..temp3 scratch variables.
 */
static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
{
#ifdef HAVE_MMX
	src+= stride;
	//FIXME try pmul for *5 stuff
//	src[0]=0;
	asm volatile(
		"pxor %%mm7, %%mm7 \n\t"
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7
//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ebx+%1	ebx+2%1
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1
		// --- left (lines 0..3) energy: 2L0 - 5L1 + 5L2 - 2L3 ---
		"movq (%0), %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
		"punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
		"movq (%%eax), %%mm2 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
		"punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
		"movq (%%eax, %1), %%mm4 \n\t"
		"movq %%mm4, %%mm5 \n\t"
		"punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
		"punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
		"paddw %%mm0, %%mm0 \n\t" // 2L0
		"paddw %%mm1, %%mm1 \n\t" // 2H0
		"psubw %%mm4, %%mm2 \n\t" // L1 - L2
		"psubw %%mm5, %%mm3 \n\t" // H1 - H2
		"psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
		"psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
		"psllw $2, %%mm2 \n\t" // 4L1 - 4L2
		"psllw $2, %%mm3 \n\t" // 4H1 - 4H2
		"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
		"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
		"movq (%%eax, %1, 2), %%mm2 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t" // L3
		"punpckhbw %%mm7, %%mm3 \n\t" // H3
		"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
		"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
		"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
		"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
		"movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
		"movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
		// --- middle (lines 2..5) energy: 2L2 - 5L3 + 5L4 - 2L5 ---
		"movq (%0, %1, 4), %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t" // L4
		"punpckhbw %%mm7, %%mm1 \n\t" // H4
		"psubw %%mm0, %%mm2 \n\t" // L3 - L4
		"psubw %%mm1, %%mm3 \n\t" // H3 - H4
		"movq %%mm2, temp2 \n\t" // L3 - L4
		"movq %%mm3, temp3 \n\t" // H3 - H4
		"paddw %%mm4, %%mm4 \n\t" // 2L2
		"paddw %%mm5, %%mm5 \n\t" // 2H2
		"psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
		"psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
		"psllw $2, %%mm2 \n\t" // 4L3 - 4L4
		"psllw $2, %%mm3 \n\t" // 4H3 - 4H4
		"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
		"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
//50 opcodes so far
		"movq (%%ebx), %%mm2 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t" // L5
		"punpckhbw %%mm7, %%mm3 \n\t" // H5
		"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
		"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
		"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
		"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
		// --- right (lines 4..7) energy: 2L4 - 5L5 + 5L6 - 2L7 ---
		"movq (%%ebx, %1), %%mm6 \n\t"
		"punpcklbw %%mm7, %%mm6 \n\t" // L6
		"psubw %%mm6, %%mm2 \n\t" // L5 - L6
		"movq (%%ebx, %1), %%mm6 \n\t"
		"punpckhbw %%mm7, %%mm6 \n\t" // H6
		"psubw %%mm6, %%mm3 \n\t" // H5 - H6
		"paddw %%mm0, %%mm0 \n\t" // 2L4
		"paddw %%mm1, %%mm1 \n\t" // 2H4
		"psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
		"psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
		"psllw $2, %%mm2 \n\t" // 4L5 - 4L6
		"psllw $2, %%mm3 \n\t" // 4H5 - 4H6
		"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
		"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
		"movq (%%ebx, %1, 2), %%mm2 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t" // L7
		"punpckhbw %%mm7, %%mm3 \n\t" // H7
		"paddw %%mm2, %%mm2 \n\t" // 2L7
		"paddw %%mm3, %%mm3 \n\t" // 2H7
		"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
		"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
		"movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
		"movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
		// --- abs() of the three energies (sign-mask trick) ---
//FIXME pxor, psubw, pmax for abs
		"movq %%mm7, %%mm6 \n\t" // 0
		"pcmpgtw %%mm0, %%mm6 \n\t"
		"pxor %%mm6, %%mm0 \n\t"
		"psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
		"movq %%mm7, %%mm6 \n\t" // 0
		"pcmpgtw %%mm1, %%mm6 \n\t"
		"pxor %%mm6, %%mm1 \n\t"
		"psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
		"movq %%mm7, %%mm6 \n\t" // 0
		"pcmpgtw %%mm2, %%mm6 \n\t"
		"pxor %%mm6, %%mm2 \n\t"
		"psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
		"movq %%mm7, %%mm6 \n\t" // 0
		"pcmpgtw %%mm3, %%mm6 \n\t"
		"pxor %%mm6, %%mm3 \n\t"
		"psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
		// min(|left|, |right|) — pminsw on MMX2, saturating-sub emulation otherwise
#ifdef HAVE_MMX2
		"pminsw %%mm2, %%mm0 \n\t"
		"pminsw %%mm3, %%mm1 \n\t"
#else
		"movq %%mm0, %%mm6 \n\t"
		"psubusw %%mm2, %%mm6 \n\t"
		"psubw %%mm6, %%mm0 \n\t"
		"movq %%mm1, %%mm6 \n\t"
		"psubusw %%mm3, %%mm6 \n\t"
		"psubw %%mm6, %%mm1 \n\t"
#endif
		"movq %%mm7, %%mm6 \n\t" // 0
		"pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
		"pxor %%mm6, %%mm4 \n\t"
		"psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
		"pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
		"pxor %%mm7, %%mm5 \n\t"
		"psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
// 100 opcodes
		// broadcast QP to all 4 words, scale to the 8*QP threshold
		"movd %2, %%mm2 \n\t" // QP
//"pcmpeqb %%mm2, %%mm2\n\t"
		"punpcklwd %%mm2, %%mm2 \n\t"
		"punpcklwd %%mm2, %%mm2 \n\t"
		"psllw $3, %%mm2 \n\t" // 8QP
		"movq %%mm2, %%mm3 \n\t" // 8QP
		"pcmpgtw %%mm4, %%mm2 \n\t"
		"pcmpgtw %%mm5, %%mm3 \n\t"
		"pand %%mm2, %%mm4 \n\t"
		"pand %%mm3, %%mm5 \n\t"
		"psubusw %%mm0, %%mm4 \n\t" // hd
		"psubusw %%mm1, %%mm5 \n\t" // ld
		// correction magnitude: (5*d + 32) >> 6
		"movq w05, %%mm2 \n\t" // 5
		"pmullw %%mm2, %%mm4 \n\t"
		"pmullw %%mm2, %%mm5 \n\t"
		"movq w20, %%mm2 \n\t" // 32
		"paddw %%mm2, %%mm4 \n\t"
		"paddw %%mm2, %%mm5 \n\t"
		"psrlw $6, %%mm4 \n\t"
		"psrlw $6, %%mm5 \n\t"
/*
		"movq w06, %%mm2 \n\t" // 6
		"paddw %%mm2, %%mm4 \n\t"
		"paddw %%mm2, %%mm5 \n\t"
		"movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
//FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
		"pmulhw %%mm2, %%mm4 \n\t" // hd/13
		"pmulhw %%mm2, %%mm5 \n\t" // ld/13
*/
		// clip the correction to |L3-L4|/2 and give it the sign of -(L3-L4)
		"movq temp2, %%mm0 \n\t" // L3 - L4
		"movq temp3, %%mm1 \n\t" // H3 - H4
		"pxor %%mm2, %%mm2 \n\t"
		"pxor %%mm3, %%mm3 \n\t"
		// FIXME rounding error
		"psraw $1, %%mm0 \n\t" // (L3 - L4)/2
		"psraw $1, %%mm1 \n\t" // (H3 - H4)/2
		"pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
		"pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
		"pxor %%mm2, %%mm0 \n\t"
		"pxor %%mm3, %%mm1 \n\t"
		"psubw %%mm2, %%mm0 \n\t" // |L3-L4|
		"psubw %%mm3, %%mm1 \n\t" // |H3-H4|
//		"psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
//		"psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
		"pxor %%mm6, %%mm2 \n\t"
		"pxor %%mm7, %%mm3 \n\t"
		"pand %%mm2, %%mm4 \n\t"
		"pand %%mm3, %%mm5 \n\t"
#ifdef HAVE_MMX2
		"pminsw %%mm0, %%mm4 \n\t"
		"pminsw %%mm1, %%mm5 \n\t"
#else
		"movq %%mm4, %%mm2 \n\t"
		"psubusw %%mm0, %%mm2 \n\t"
		"psubw %%mm2, %%mm4 \n\t"
		"movq %%mm5, %%mm2 \n\t"
		"psubusw %%mm1, %%mm2 \n\t"
		"psubw %%mm2, %%mm5 \n\t"
#endif
		"pxor %%mm6, %%mm4 \n\t"
		"pxor %%mm7, %%mm5 \n\t"
		"psubw %%mm6, %%mm4 \n\t"
		"psubw %%mm7, %%mm5 \n\t"
		"packsswb %%mm5, %%mm4 \n\t"
		// apply: line 3 += d, line 4 -= d
		"movq (%%eax, %1, 2), %%mm0 \n\t"
		"paddb %%mm4, %%mm0 \n\t"
		"movq %%mm0, (%%eax, %1, 2) \n\t"
		"movq (%0, %1, 4), %%mm0 \n\t"
		"psubb %%mm4, %%mm0 \n\t"
//		"pxor %%mm0, %%mm0 \n\t"
		"movq %%mm0, (%0, %1, 4) \n\t"
		:
		: "r" (src), "r" (stride), "r" (QP)
		: "%eax", "%ebx"
	);
#else
	// C reference implementation: per-column conditional, clipped correction.
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
//	const int l9= stride + l8;
	int x;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
		if(ABS(middleEnergy) < 8*QP)
		{
			const int q=(src[l4] - src[l5])/2;
			const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
			const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);

			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
			d= MAX(d, 0);

			d= (5*d + 32) >> 6;
			d*= SIGN(-middleEnergy);

			// clip the correction so it never exceeds half the boundary step
			if(q>0)
			{
				d= d<0 ? 0 : d;
				d= d>q ? q : d;
			}
			else
			{
				d= d>0 ? 0 : d;
				d= d<q ? q : d;
			}

			src[l4]-= d;
			src[l5]+= d;
		}
		src++;
	}
#endif
}
  1160. //FIXME? |255-0| = 1
  1161. /**
  1162. * Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock.
  1163. */
  1164. static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
  1165. {
  1166. // src++;
  1167. int numEq= 0;
  1168. #ifdef HAVE_MMX
  1169. asm volatile (
  1170. // "int $3 \n\t"
  1171. "pushl %1\n\t"
  1172. "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
  1173. "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
  1174. "leal tempBlock, %%eax \n\t"
  1175. "pxor %%mm0, %%mm0 \n\t"
  1176. #define HDC_CHECK_AND_CPY(i) \
  1177. "movq -4(%1), %%mm2 \n\t"\
  1178. "psrlq $32, %%mm2 \n\t"\
  1179. "punpckldq 4(%1), %%mm2 \n\t" /* (%1) */\
  1180. "movq %%mm2, %%mm1 \n\t"\
  1181. "psrlq $8, %%mm2 \n\t"\
  1182. "psubb %%mm1, %%mm2 \n\t"\
  1183. "paddb %%mm7, %%mm2 \n\t"\
  1184. "pcmpgtb %%mm6, %%mm2 \n\t"\
  1185. "paddb %%mm2, %%mm0 \n\t"\
  1186. "movq %%mm1," #i "(%%eax) \n\t"
  1187. HDC_CHECK_AND_CPY(0)
  1188. "addl %2, %1 \n\t"
  1189. HDC_CHECK_AND_CPY(8)
  1190. "addl %2, %1 \n\t"
  1191. HDC_CHECK_AND_CPY(16)
  1192. "addl %2, %1 \n\t"
  1193. HDC_CHECK_AND_CPY(24)
  1194. "addl %2, %1 \n\t"
  1195. HDC_CHECK_AND_CPY(32)
  1196. "addl %2, %1 \n\t"
  1197. HDC_CHECK_AND_CPY(40)
  1198. "addl %2, %1 \n\t"
  1199. HDC_CHECK_AND_CPY(48)
  1200. "addl %2, %1 \n\t"
  1201. HDC_CHECK_AND_CPY(56)
  1202. "psllq $8, %%mm0 \n\t" // remove dummy value
  1203. "movq %%mm0, %%mm1 \n\t"
  1204. "psrlw $8, %%mm0 \n\t"
  1205. "paddb %%mm1, %%mm0 \n\t"
  1206. "movq %%mm0, %%mm1 \n\t"
  1207. "psrlq $16, %%mm0 \n\t"
  1208. "paddb %%mm1, %%mm0 \n\t"
  1209. "movq %%mm0, %%mm1 \n\t"
  1210. "psrlq $32, %%mm0 \n\t"
  1211. "paddb %%mm1, %%mm0 \n\t"
  1212. "popl %1\n\t"
  1213. "movd %%mm0, %0 \n\t"
  1214. : "=r" (numEq)
  1215. : "r" (src), "r" (stride)
  1216. : "%eax"
  1217. );
  1218. // printf("%d\n", numEq);
  1219. numEq= (256 - (numEq & 0xFF)) &0xFF;
  1220. #else
  1221. int y;
  1222. for(y=0; y<BLOCK_SIZE; y++)
  1223. {
  1224. if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
  1225. if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
  1226. if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
  1227. if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
  1228. if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
  1229. if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
  1230. if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
  1231. tempBlock[0 + y*TEMP_STRIDE] = src[0];
  1232. tempBlock[1 + y*TEMP_STRIDE] = src[1];
  1233. tempBlock[2 + y*TEMP_STRIDE] = src[2];
  1234. tempBlock[3 + y*TEMP_STRIDE] = src[3];
  1235. tempBlock[4 + y*TEMP_STRIDE] = src[4];
  1236. tempBlock[5 + y*TEMP_STRIDE] = src[5];
  1237. tempBlock[6 + y*TEMP_STRIDE] = src[6];
  1238. tempBlock[7 + y*TEMP_STRIDE] = src[7];
  1239. src+= stride;
  1240. }
  1241. #endif
  1242. /* if(abs(numEq - asmEq) > 0)
  1243. {
  1244. // printf("\nasm:%d c:%d\n", asmEq, numEq);
  1245. for(int y=0; y<8; y++)
  1246. {
  1247. for(int x=0; x<8; x++)
  1248. {
  1249. printf("%d ", src[x + y*stride]);
  1250. }
  1251. printf("\n");
  1252. }
  1253. }
  1254. */
  1255. // printf("%d\n", numEq);
  1256. return numEq > hFlatnessThreshold;
  1257. }
  1258. static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
  1259. {
  1260. #ifdef MMX_FIXME
  1261. FIXME
  1262. int isOk;
  1263. asm volatile(
  1264. // "int $3 \n\t"
  1265. "movq (%1, %2), %%mm0 \n\t"
  1266. "movq (%1, %2, 8), %%mm1 \n\t"
  1267. "movq %%mm0, %%mm2 \n\t"
  1268. "psubusb %%mm1, %%mm0 \n\t"
  1269. "psubusb %%mm2, %%mm1 \n\t"
  1270. "por %%mm1, %%mm0 \n\t" // ABS Diff
  1271. "movq pQPb, %%mm7 \n\t" // QP,..., QP
  1272. "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
  1273. "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
  1274. "pcmpeqd b00, %%mm0 \n\t"
  1275. "psrlq $16, %%mm0 \n\t"
  1276. "pcmpeqd bFF, %%mm0 \n\t"
  1277. // "movd %%mm0, (%1, %2, 4)\n\t"
  1278. "movd %%mm0, %0 \n\t"
  1279. : "=r" (isOk)
  1280. : "r" (src), "r" (stride)
  1281. );
  1282. return isOk;
  1283. #else
  1284. if(abs(src[0] - src[7]) > 2*QP) return 0;
  1285. return 1;
  1286. #endif
  1287. }
  1288. static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP)
  1289. {
  1290. #ifdef HAVE_MMX
  1291. asm volatile(
  1292. "pushl %0 \n\t"
  1293. "pxor %%mm7, %%mm7 \n\t"
  1294. "movq bm00001000, %%mm6 \n\t"
  1295. "movd %2, %%mm5 \n\t" // QP
  1296. "movq %%mm5, %%mm4 \n\t"
  1297. "paddusb %%mm5, %%mm5 \n\t" // 2QP
  1298. "paddusb %%mm5, %%mm4 \n\t" // 3QP
  1299. "psllq $24, %%mm4 \n\t"
  1300. "pxor %%mm5, %%mm5 \n\t" // 0
  1301. "psubb %%mm4, %%mm5 \n\t" // -QP
  1302. "leal tempBlock, %%eax \n\t"
  1303. //FIXME? "unroll by 2" and mix
  1304. #ifdef HAVE_MMX2
  1305. #define HDF(i) \
  1306. "movq " #i "(%%eax), %%mm0 \n\t"\
  1307. "movq %%mm0, %%mm1 \n\t"\
  1308. "movq %%mm0, %%mm2 \n\t"\
  1309. "psrlq $8, %%mm1 \n\t"\
  1310. "psubusb %%mm1, %%mm2 \n\t"\
  1311. "psubusb %%mm0, %%mm1 \n\t"\
  1312. "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
  1313. "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
  1314. "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
  1315. "pminub %%mm1, %%mm3 \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\
  1316. "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
  1317. "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
  1318. "paddb %%mm5, %%mm1 \n\t"\
  1319. "psubusb %%mm5, %%mm1 \n\t"\
  1320. "psrlw $2, %%mm1 \n\t"\
  1321. "pxor %%mm2, %%mm1 \n\t"\
  1322. "psubb %%mm2, %%mm1 \n\t"\
  1323. "pand %%mm6, %%mm1 \n\t"\
  1324. "psubb %%mm1, %%mm0 \n\t"\
  1325. "psllq $8, %%mm1 \n\t"\
  1326. "paddb %%mm1, %%mm0 \n\t"\
  1327. "movd %%mm0, (%0) \n\t"\
  1328. "psrlq $32, %%mm0 \n\t"\
  1329. "movd %%mm0, 4(%0) \n\t"
  1330. #else
  1331. #define HDF(i)\
  1332. "movq " #i "(%%eax), %%mm0 \n\t"\
  1333. "movq %%mm0, %%mm1 \n\t"\
  1334. "movq %%mm0, %%mm2 \n\t"\
  1335. "psrlq $8, %%mm1 \n\t"\
  1336. "psubusb %%mm1, %%mm2 \n\t"\
  1337. "psubusb %%mm0, %%mm1 \n\t"\
  1338. "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
  1339. "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
  1340. "movq %%mm1, %%mm3 \n\t"\
  1341. "psllq $32, %%mm3 \n\t"\
  1342. "movq %%mm3, %%mm4 \n\t"\
  1343. "psubusb %%mm1, %%mm4 \n\t"\
  1344. "psubb %%mm4, %%mm3 \n\t"\
  1345. "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
  1346. "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5,ü6|) */\
  1347. "paddb %%mm5, %%mm1 \n\t"\
  1348. "psubusb %%mm5, %%mm1 \n\t"\
  1349. "psrlw $2, %%mm1 \n\t"\
  1350. "pxor %%mm2, %%mm1 \n\t"\
  1351. "psubb %%mm2, %%mm1 \n\t"\
  1352. "pand %%mm6, %%mm1 \n\t"\
  1353. "psubb %%mm1, %%mm0 \n\t"\
  1354. "psllq $8, %%mm1 \n\t"\
  1355. "paddb %%mm1, %%mm0 \n\t"\
  1356. "movd %%mm0, (%0) \n\t"\
  1357. "psrlq $32, %%mm0 \n\t"\
  1358. "movd %%mm0, 4(%0) \n\t"
  1359. #endif
  1360. HDF(0)
  1361. "addl %1, %0 \n\t"
  1362. HDF(8)
  1363. "addl %1, %0 \n\t"
  1364. HDF(16)
  1365. "addl %1, %0 \n\t"
  1366. HDF(24)
  1367. "addl %1, %0 \n\t"
  1368. HDF(32)
  1369. "addl %1, %0 \n\t"
  1370. HDF(40)
  1371. "addl %1, %0 \n\t"
  1372. HDF(48)
  1373. "addl %1, %0 \n\t"
  1374. HDF(56)
  1375. "popl %0 \n\t"
  1376. :
  1377. : "r" (dst), "r" (stride), "r" (QP)
  1378. : "%eax"
  1379. );
  1380. #else
  1381. uint8_t *src= tempBlock;
  1382. int y;
  1383. for(y=0; y<BLOCK_SIZE; y++)
  1384. {
  1385. dst[0] = src[0];
  1386. dst[1] = src[1];
  1387. dst[2] = src[2];
  1388. dst[3] = src[3];
  1389. dst[4] = src[4];
  1390. dst[5] = src[5];
  1391. dst[6] = src[6];
  1392. dst[7] = src[7];
  1393. const int middleEnergy= 5*(src[4] - src[5]) + 2*(src[2] - src[5]);
  1394. if(ABS(middleEnergy) < 8*QP)
  1395. {
  1396. const int q=(src[3] - src[4])/2;
  1397. const int leftEnergy= 5*(src[2] - src[1]) + 2*(src[0] - src[3]);
  1398. const int rightEnergy= 5*(src[6] - src[5]) + 2*(src[4] - src[7]);
  1399. int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
  1400. d= MAX(d, 0);
  1401. d= (5*d + 32) >> 6;
  1402. d*= SIGN(-middleEnergy);
  1403. if(q>0)
  1404. {
  1405. d= d<0 ? 0 : d;
  1406. d= d>q ? q : d;
  1407. }
  1408. else
  1409. {
  1410. d= d>0 ? 0 : d;
  1411. d= d<q ? q : d;
  1412. }
  1413. dst[3]-= d;
  1414. dst[4]+= d;
  1415. }
  1416. dst+= stride;
  1417. src+= TEMP_STRIDE;
  1418. }
  1419. #endif
  1420. }
  1421. /**
  1422. * Do a horizontal low pass filter on the 8x8 block
  1423. * useing the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
  1424. * useing approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2/3DNOW version)
  1425. */
  1426. static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
  1427. {
  1428. //return;
  1429. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  1430. asm volatile( //"movv %0 %1 %2\n\t"
  1431. "pushl %0\n\t"
  1432. "pxor %%mm7, %%mm7 \n\t"
  1433. "leal tempBlock, %%eax \n\t"
  1434. /*
  1435. #define HLP1 "movq (%0), %%mm0 \n\t"\
  1436. "movq %%mm0, %%mm1 \n\t"\
  1437. "psllq $8, %%mm0 \n\t"\
  1438. PAVGB(%%mm1, %%mm0)\
  1439. "psrlw $8, %%mm0 \n\t"\
  1440. "pxor %%mm1, %%mm1 \n\t"\
  1441. "packuswb %%mm1, %%mm0 \n\t"\
  1442. "movq %%mm0, %%mm1 \n\t"\
  1443. "movq %%mm0, %%mm2 \n\t"\
  1444. "psllq $32, %%mm0 \n\t"\
  1445. "paddb %%mm0, %%mm1 \n\t"\
  1446. "psllq $16, %%mm2 \n\t"\
  1447. PAVGB(%%mm2, %%mm0)\
  1448. "movq %%mm0, %%mm3 \n\t"\
  1449. "pand bm11001100, %%mm0 \n\t"\
  1450. "paddusb %%mm0, %%mm3 \n\t"\
  1451. "psrlq $8, %%mm3 \n\t"\
  1452. PAVGB(%%mm1, %%mm4)\
  1453. PAVGB(%%mm3, %%mm2)\
  1454. "psrlq $16, %%mm2 \n\t"\
  1455. "punpcklbw %%mm2, %%mm2 \n\t"\
  1456. "movq %%mm2, (%0) \n\t"\
  1457. #define HLP2 "movq (%0), %%mm0 \n\t"\
  1458. "movq %%mm0, %%mm1 \n\t"\
  1459. "psllq $8, %%mm0 \n\t"\
  1460. PAVGB(%%mm1, %%mm0)\
  1461. "psrlw $8, %%mm0 \n\t"\
  1462. "pxor %%mm1, %%mm1 \n\t"\
  1463. "packuswb %%mm1, %%mm0 \n\t"\
  1464. "movq %%mm0, %%mm2 \n\t"\
  1465. "psllq $32, %%mm0 \n\t"\
  1466. "psllq $16, %%mm2 \n\t"\
  1467. PAVGB(%%mm2, %%mm0)\
  1468. "movq %%mm0, %%mm3 \n\t"\
  1469. "pand bm11001100, %%mm0 \n\t"\
  1470. "paddusb %%mm0, %%mm3 \n\t"\
  1471. "psrlq $8, %%mm3 \n\t"\
  1472. PAVGB(%%mm3, %%mm2)\
  1473. "psrlq $16, %%mm2 \n\t"\
  1474. "punpcklbw %%mm2, %%mm2 \n\t"\
  1475. "movq %%mm2, (%0) \n\t"\
  1476. */
  1477. // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
  1478. /*
  1479. 31
  1480. 121
  1481. 121
  1482. 121
  1483. 121
  1484. 121
  1485. 121
  1486. 13
  1487. Implemented Exact 7-Tap
  1488. 9421 A321
  1489. 36421 64321
  1490. 334321 =
  1491. 1234321 =
  1492. 1234321 =
  1493. 123433 =
  1494. 12463 12346
  1495. 1249 123A
  1496. */
  1497. #ifdef HAVE_MMX2
  1498. #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
  1499. "movq %%mm0, %%mm1 \n\t"\
  1500. "movq %%mm0, %%mm2 \n\t"\
  1501. "movq %%mm0, %%mm3 \n\t"\
  1502. "movq %%mm0, %%mm4 \n\t"\
  1503. "psllq $8, %%mm1 \n\t"\
  1504. "psrlq $8, %%mm2 \n\t"\
  1505. "pand bm00000001, %%mm3 \n\t"\
  1506. "pand bm10000000, %%mm4 \n\t"\
  1507. "por %%mm3, %%mm1 \n\t"\
  1508. "por %%mm4, %%mm2 \n\t"\
  1509. PAVGB(%%mm2, %%mm1)\
  1510. PAVGB(%%mm1, %%mm0)\
  1511. \
  1512. "pshufw $0xF9, %%mm0, %%mm3 \n\t"\
  1513. "pshufw $0x90, %%mm0, %%mm4 \n\t"\
  1514. PAVGB(%%mm3, %%mm4)\
  1515. PAVGB(%%mm4, %%mm0)\
  1516. "movd %%mm0, (%0) \n\t"\
  1517. "psrlq $32, %%mm0 \n\t"\
  1518. "movd %%mm0, 4(%0) \n\t"
  1519. #else
  1520. #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
  1521. "movq %%mm0, %%mm1 \n\t"\
  1522. "movq %%mm0, %%mm2 \n\t"\
  1523. "movq %%mm0, %%mm3 \n\t"\
  1524. "movq %%mm0, %%mm4 \n\t"\
  1525. "psllq $8, %%mm1 \n\t"\
  1526. "psrlq $8, %%mm2 \n\t"\
  1527. "pand bm00000001, %%mm3 \n\t"\
  1528. "pand bm10000000, %%mm4 \n\t"\
  1529. "por %%mm3, %%mm1 \n\t"\
  1530. "por %%mm4, %%mm2 \n\t"\
  1531. PAVGB(%%mm2, %%mm1)\
  1532. PAVGB(%%mm1, %%mm0)\
  1533. \
  1534. "movq %%mm0, %%mm3 \n\t"\
  1535. "movq %%mm0, %%mm4 \n\t"\
  1536. "movq %%mm0, %%mm5 \n\t"\
  1537. "psrlq $16, %%mm3 \n\t"\
  1538. "psllq $16, %%mm4 \n\t"\
  1539. "pand bm11000000, %%mm5 \n\t"\
  1540. "por %%mm5, %%mm3 \n\t"\
  1541. "movq %%mm0, %%mm5 \n\t"\
  1542. "pand bm00000011, %%mm5 \n\t"\
  1543. "por %%mm5, %%mm4 \n\t"\
  1544. PAVGB(%%mm3, %%mm4)\
  1545. PAVGB(%%mm4, %%mm0)\
  1546. "movd %%mm0, (%0) \n\t"\
  1547. "psrlq $32, %%mm0 \n\t"\
  1548. "movd %%mm0, 4(%0) \n\t"
  1549. #endif
  1550. #define HLP(i) HLP3(i)
  1551. HLP(0)
  1552. "addl %1, %0 \n\t"
  1553. HLP(8)
  1554. "addl %1, %0 \n\t"
  1555. HLP(16)
  1556. "addl %1, %0 \n\t"
  1557. HLP(24)
  1558. "addl %1, %0 \n\t"
  1559. HLP(32)
  1560. "addl %1, %0 \n\t"
  1561. HLP(40)
  1562. "addl %1, %0 \n\t"
  1563. HLP(48)
  1564. "addl %1, %0 \n\t"
  1565. HLP(56)
  1566. "popl %0\n\t"
  1567. :
  1568. : "r" (dst), "r" (stride)
  1569. : "%eax", "%ebx"
  1570. );
  1571. #else
  1572. uint8_t *temp= tempBlock;
  1573. int y;
  1574. for(y=0; y<BLOCK_SIZE; y++)
  1575. {
  1576. const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
  1577. const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
  1578. int sums[9];
  1579. sums[0] = first + temp[0];
  1580. sums[1] = temp[0] + temp[1];
  1581. sums[2] = temp[1] + temp[2];
  1582. sums[3] = temp[2] + temp[3];
  1583. sums[4] = temp[3] + temp[4];
  1584. sums[5] = temp[4] + temp[5];
  1585. sums[6] = temp[5] + temp[6];
  1586. sums[7] = temp[6] + temp[7];
  1587. sums[8] = temp[7] + last;
  1588. dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
  1589. dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
  1590. dst[2]= ((dst[2]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
  1591. dst[3]= ((dst[3]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
  1592. dst[4]= ((dst[4]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
  1593. dst[5]= ((dst[5]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
  1594. dst[6]= ((last + dst[6]<<2) + (dst[7] + sums[5]<<1) + sums[3] + 8)>>4;
  1595. dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
  1596. dst+= stride;
  1597. temp+= TEMP_STRIDE;
  1598. }
  1599. #endif
  1600. }
/**
 * De-ringing filter — currently a stub.
 * The MMX2 draft below is compiled out: the guard is the (deliberately
 * nonexistent) HAVE_MMX2X macro, and the C branch is empty, so this
 * function is a no-op on every build.  The draft only computes the
 * per-block min/max and their average; the actual filtering is missing.
 * NOTE(review): inside the disabled draft, "pcmpeq" (no size suffix) and
 * the trailing comma in "movq (" #addr "), %%mm0," would not assemble
 * as-is — fix before ever enabling this path.
 */
static inline void dering(uint8_t src[], int stride, int QP)
{
//FIXME
#ifdef HAVE_MMX2X
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"pcmpeq %%mm6, %%mm6 \n\t"
		"pxor %%mm7, %%mm7 \n\t"

#define FIND_MIN_MAX(addr)\
		"movq (" #addr "), %%mm0, \n\t"\
		"pminub %%mm0, %%mm6 \n\t"\
		"pmaxub %%mm0, %%mm7 \n\t"

FIND_MIN_MAX(%0)
FIND_MIN_MAX(%%eax)
FIND_MIN_MAX(%%eax, %1)
FIND_MIN_MAX(%%eax, %1, 2)
FIND_MIN_MAX(%0, %1, 4)
FIND_MIN_MAX(%%ebx)
FIND_MIN_MAX(%%ebx, %1)
FIND_MIN_MAX(%%ebx, %1, 2)
FIND_MIN_MAX(%0, %1, 8)
FIND_MIN_MAX(%%ebx, %1, 2)

		// horizontal reduction of the 8 byte lanes to a single min ...
		"movq %%mm6, %%mm4 \n\t"
		"psrlq $32, %%mm6 \n\t"
		"pminub %%mm4, %%mm6 \n\t"
		"movq %%mm6, %%mm4 \n\t"
		"psrlq $16, %%mm6 \n\t"
		"pminub %%mm4, %%mm6 \n\t"
		"movq %%mm6, %%mm4 \n\t"
		"psrlq $8, %%mm6 \n\t"
		"pminub %%mm4, %%mm6 \n\t" // min of pixels

		// ... and a single max
		"movq %%mm7, %%mm4 \n\t"
		"psrlq $32, %%mm7 \n\t"
		"pmaxub %%mm4, %%mm7 \n\t"
		"movq %%mm7, %%mm4 \n\t"
		"psrlq $16, %%mm7 \n\t"
		"pmaxub %%mm4, %%mm7 \n\t"
		"movq %%mm7, %%mm4 \n\t"
		"psrlq $8, %%mm7 \n\t"
		"pmaxub %%mm4, %%mm7 \n\t" // max of pixels
		PAVGB(%%mm6, %%mm7) // (max + min)/2

		: : "r" (src), "r" (stride), "r" (QP)
		: "%eax", "%ebx"
	);
#else
//FIXME
#endif
}
  1652. #ifdef HAVE_ODIVX_POSTPROCESS
  1653. #include "../opendivx/postprocess.h"
  1654. int use_old_pp=0;
  1655. #endif
  1656. static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
  1657. QP_STORE_T QPs[], int QPStride, int isColor, int mode);
  1658. /**
  1659. * ...
  1660. * the mode value is interpreted as a quality value if its negative, its range is then (-1 ... -63)
  1661. * -63 is best quality -1 is worst
  1662. */
  1663. //extern "C"{
  1664. void postprocess(unsigned char * src[], int src_stride,
  1665. unsigned char * dst[], int dst_stride,
  1666. int horizontal_size, int vertical_size,
  1667. QP_STORE_T *QP_store, int QP_stride,
  1668. int mode)
  1669. {
  1670. #ifdef HAVE_ODIVX_POSTPROCESS
  1671. // Note: I could make this shit outside of this file, but it would mean one
  1672. // more function call...
  1673. if(use_old_pp){
  1674. odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
  1675. return;
  1676. }
  1677. #endif
  1678. // I'm calling this from dec_video.c:video_set_postprocess()
  1679. // if(mode<0) mode= getModeForQuality(-mode);
  1680. /*
  1681. long long T= rdtsc();
  1682. for(int y=vertical_size-1; y>=0 ; y--)
  1683. memcpy(dst[0] + y*src_stride, src[0] + y*src_stride,src_stride);
  1684. // memcpy(dst[0], src[0],src_stride*vertical_size);
  1685. printf("%4dk\r", (rdtsc()-T)/1000);
  1686. return;
  1687. */
  1688. /*
  1689. long long T= rdtsc();
  1690. while( (rdtsc() - T)/1000 < 4000);
  1691. return;
  1692. */
  1693. postProcess(src[0], src_stride, dst[0], dst_stride,
  1694. horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
  1695. horizontal_size >>= 1;
  1696. vertical_size >>= 1;
  1697. src_stride >>= 1;
  1698. dst_stride >>= 1;
  1699. mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
  1700. if(1)
  1701. {
  1702. postProcess(src[1], src_stride, dst[1], dst_stride,
  1703. horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
  1704. postProcess(src[2], src_stride, dst[2], dst_stride,
  1705. horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
  1706. }
  1707. else
  1708. {
  1709. memcpy(dst[1], src[1], src_stride*horizontal_size);
  1710. memcpy(dst[2], src[2], src_stride*horizontal_size);
  1711. }
  1712. }
  1713. /**
  1714. * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
  1715. * 0 <= quality <= 6
  1716. */
  1717. int getPpModeForQuality(int quality){
  1718. int modes[1+GET_PP_QUALITY_MAX]= {
  1719. 0,
  1720. #if 1
  1721. // horizontal filters first
  1722. LUM_H_DEBLOCK,
  1723. LUM_H_DEBLOCK | LUM_V_DEBLOCK,
  1724. LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
  1725. LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
  1726. LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
  1727. LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
  1728. #else
  1729. // vertical filters first
  1730. LUM_V_DEBLOCK,
  1731. LUM_V_DEBLOCK | LUM_H_DEBLOCK,
  1732. LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
  1733. LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
  1734. LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
  1735. LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
  1736. #endif
  1737. };
  1738. #ifdef HAVE_ODIVX_POSTPROCESS
  1739. int odivx_modes[1+GET_PP_QUALITY_MAX]= {
  1740. 0,
  1741. PP_DEBLOCK_Y_H,
  1742. PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
  1743. PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
  1744. PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
  1745. PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
  1746. PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
  1747. };
  1748. if(use_old_pp) return odivx_modes[quality];
  1749. #endif
  1750. return modes[quality];
  1751. }
  1752. //} // extern "C"
  1753. /**
  1754. * Copies a block from src to dst and fixes the blacklevel
  1755. * numLines must be a multiple of 4
  1756. * levelFix == 0 -> dont touch the brighness & contrast
  1757. */
  1758. static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
  1759. int numLines, int levelFix)
  1760. {
  1761. int i;
  1762. if(levelFix)
  1763. {
  1764. #ifdef HAVE_MMX
  1765. asm volatile(
  1766. "movl %4, %%eax \n\t"
  1767. "movl %%eax, temp0\n\t"
  1768. "pushl %0 \n\t"
  1769. "pushl %1 \n\t"
  1770. "leal (%2,%2), %%eax \n\t"
  1771. "leal (%3,%3), %%ebx \n\t"
  1772. "movq packedYOffset, %%mm2 \n\t"
  1773. "movq packedYScale, %%mm3 \n\t"
  1774. "pxor %%mm4, %%mm4 \n\t"
  1775. #define SCALED_CPY \
  1776. "movq (%0), %%mm0 \n\t"\
  1777. "movq (%0,%2), %%mm1 \n\t"\
  1778. "psubusb %%mm2, %%mm0 \n\t"\
  1779. "psubusb %%mm2, %%mm1 \n\t"\
  1780. "movq %%mm0, %%mm5 \n\t"\
  1781. "punpcklbw %%mm4, %%mm0 \n\t"\
  1782. "punpckhbw %%mm4, %%mm5 \n\t"\
  1783. "psllw $7, %%mm0 \n\t"\
  1784. "psllw $7, %%mm5 \n\t"\
  1785. "pmulhw %%mm3, %%mm0 \n\t"\
  1786. "pmulhw %%mm3, %%mm5 \n\t"\
  1787. "packuswb %%mm5, %%mm0 \n\t"\
  1788. "movq %%mm0, (%1) \n\t"\
  1789. "movq %%mm1, %%mm5 \n\t"\
  1790. "punpcklbw %%mm4, %%mm1 \n\t"\
  1791. "punpckhbw %%mm4, %%mm5 \n\t"\
  1792. "psllw $7, %%mm1 \n\t"\
  1793. "psllw $7, %%mm5 \n\t"\
  1794. "pmulhw %%mm3, %%mm1 \n\t"\
  1795. "pmulhw %%mm3, %%mm5 \n\t"\
  1796. "packuswb %%mm5, %%mm1 \n\t"\
  1797. "movq %%mm1, (%1, %3) \n\t"\
  1798. "1: \n\t"
  1799. SCALED_CPY
  1800. "addl %%eax, %0 \n\t"
  1801. "addl %%ebx, %1 \n\t"
  1802. SCALED_CPY
  1803. "addl %%eax, %0 \n\t"
  1804. "addl %%ebx, %1 \n\t"
  1805. "decl temp0 \n\t"
  1806. "jnz 1b \n\t"
  1807. "popl %1 \n\t"
  1808. "popl %0 \n\t"
  1809. : : "r" (src),
  1810. "r" (dst),
  1811. "r" (srcStride),
  1812. "r" (dstStride),
  1813. "m" (numLines>>2)
  1814. : "%eax", "%ebx"
  1815. );
  1816. #else
  1817. for(i=0; i<numLines; i++)
  1818. memcpy( &(dst[dstStride*i]),
  1819. &(src[srcStride*i]), BLOCK_SIZE);
  1820. #endif
  1821. }
  1822. else
  1823. {
  1824. #ifdef HAVE_MMX
  1825. asm volatile(
  1826. "movl %4, %%eax \n\t"
  1827. "movl %%eax, temp0\n\t"
  1828. "pushl %0 \n\t"
  1829. "pushl %1 \n\t"
  1830. "leal (%2,%2), %%eax \n\t"
  1831. "leal (%3,%3), %%ebx \n\t"
  1832. "movq packedYOffset, %%mm2 \n\t"
  1833. "movq packedYScale, %%mm3 \n\t"
  1834. #define SIMPLE_CPY \
  1835. "movq (%0), %%mm0 \n\t"\
  1836. "movq (%0,%2), %%mm1 \n\t"\
  1837. "movq %%mm0, (%1) \n\t"\
  1838. "movq %%mm1, (%1, %3) \n\t"\
  1839. "1: \n\t"
  1840. SIMPLE_CPY
  1841. "addl %%eax, %0 \n\t"
  1842. "addl %%ebx, %1 \n\t"
  1843. SIMPLE_CPY
  1844. "addl %%eax, %0 \n\t"
  1845. "addl %%ebx, %1 \n\t"
  1846. "decl temp0 \n\t"
  1847. "jnz 1b \n\t"
  1848. "popl %1 \n\t"
  1849. "popl %0 \n\t"
  1850. : : "r" (src),
  1851. "r" (dst),
  1852. "r" (srcStride),
  1853. "r" (dstStride),
  1854. "m" (numLines>>2)
  1855. : "%eax", "%ebx"
  1856. );
  1857. #else
  1858. for(i=0; i<numLines; i++)
  1859. memcpy( &(dst[dstStride*i]),
  1860. &(src[srcStride*i]), BLOCK_SIZE);
  1861. #endif
  1862. }
  1863. }
  1864. /**
  1865. * Filters array of bytes (Y or U or V values)
  1866. */
  1867. static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
  1868. QP_STORE_T QPs[], int QPStride, int isColor, int mode)
  1869. {
  1870. int x,y;
  1871. /* we need 64bit here otherwise we´ll going to have a problem
  1872. after watching a black picture for 5 hours*/
  1873. static uint64_t *yHistogram= NULL;
  1874. int black=0, white=255; // blackest black and whitest white in the picture
  1875. #ifdef TIMEING
  1876. long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
  1877. sumTime= rdtsc();
  1878. #endif
  1879. if(!yHistogram)
  1880. {
  1881. int i;
  1882. yHistogram= (uint64_t*)malloc(8*256);
  1883. for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
  1884. }
  1885. if(!isColor)
  1886. {
  1887. uint64_t sum= 0;
  1888. int i;
  1889. static int framenum= -1;
  1890. uint64_t maxClipped;
  1891. uint64_t clipped;
  1892. double scale;
  1893. framenum++;
  1894. if(framenum == 1) yHistogram[0]= width*height/64*15/256;
  1895. for(i=0; i<256; i++)
  1896. {
  1897. sum+= yHistogram[i];
  1898. // printf("%d ", yHistogram[i]);
  1899. }
  1900. // printf("\n\n");
  1901. /* we allways get a completly black picture first */
  1902. maxClipped= (uint64_t)(sum * maxClippedThreshold);
  1903. clipped= sum;
  1904. for(black=255; black>0; black--)
  1905. {
  1906. if(clipped < maxClipped) break;
  1907. clipped-= yHistogram[black];
  1908. }
  1909. clipped= sum;
  1910. for(white=0; white<256; white++)
  1911. {
  1912. if(clipped < maxClipped) break;
  1913. clipped-= yHistogram[white];
  1914. }
  1915. // we cant handle negative correctures
  1916. packedYOffset= MAX(black - minAllowedY, 0);
  1917. packedYOffset|= packedYOffset<<32;
  1918. packedYOffset|= packedYOffset<<16;
  1919. packedYOffset|= packedYOffset<<8;
  1920. scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
  1921. packedYScale= (uint16_t)(scale*512.0 + 0.5);
  1922. packedYScale|= packedYScale<<32;
  1923. packedYScale|= packedYScale<<16;
  1924. }
  1925. else
  1926. {
  1927. packedYScale= 0x0100010001000100LL;
  1928. packedYOffset= 0;
  1929. }
  1930. for(x=0; x<width; x+=BLOCK_SIZE)
  1931. blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
  1932. for(y=0; y<height; y+=BLOCK_SIZE)
  1933. {
  1934. //1% speedup if these are here instead of the inner loop
  1935. uint8_t *srcBlock= &(src[y*srcStride]);
  1936. uint8_t *dstBlock= &(dst[y*dstStride]);
  1937. uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start
  1938. uint8_t *vertBlock= &(dstBlock[dstStride*3]);
  1939. // finish 1 block before the next otherwise we´ll might have a problem
  1940. // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
  1941. for(x=0; x<width; x+=BLOCK_SIZE)
  1942. {
  1943. const int stride= dstStride;
  1944. int QP= isColor ?
  1945. QPs[(y>>3)*QPStride + (x>>3)]:
  1946. QPs[(y>>4)*QPStride + (x>>4)];
  1947. if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
  1948. #ifdef HAVE_MMX
  1949. asm volatile(
  1950. "movd %0, %%mm7 \n\t"
  1951. "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
  1952. "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
  1953. "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
  1954. "movq %%mm7, pQPb \n\t"
  1955. : : "r" (QP)
  1956. );
  1957. #endif
  1958. if(y + 12 < height)
  1959. {
  1960. #ifdef MORE_TIMEING
  1961. T0= rdtsc();
  1962. #endif
  1963. #ifdef HAVE_MMX2
  1964. prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
  1965. prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
  1966. prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
  1967. prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
  1968. #elif defined(HAVE_3DNOW)
  1969. //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
  1970. /* prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
  1971. prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
  1972. prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
  1973. prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
  1974. */
  1975. #endif
  1976. if(!isColor) yHistogram[ srcBlock[0] ]++;
  1977. blockCopy(vertBlock + dstStride*2, dstStride,
  1978. vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
  1979. #ifdef MORE_TIMEING
  1980. T1= rdtsc();
  1981. memcpyTime+= T1-T0;
  1982. T0=T1;
  1983. #endif
  1984. if(mode & V_DEBLOCK)
  1985. {
  1986. if(mode & V_RK1_FILTER)
  1987. vertRK1Filter(vertBlock, stride, QP);
  1988. else if(mode & V_X1_FILTER)
  1989. vertX1Filter(vertBlock, stride, QP);
  1990. else
  1991. {
  1992. if( isVertDC(vertBlock, stride))
  1993. {
  1994. if(isVertMinMaxOk(vertBlock, stride, QP))
  1995. doVertLowPass(vertBlock, stride, QP);
  1996. }
  1997. else
  1998. doVertDefFilter(vertBlock, stride, QP);
  1999. }
  2000. }
  2001. #ifdef MORE_TIMEING
  2002. T1= rdtsc();
  2003. vertTime+= T1-T0;
  2004. T0=T1;
  2005. #endif
  2006. }
  2007. else
  2008. blockCopy(vertBlock + dstStride*1, dstStride,
  2009. vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
  2010. if(x - 8 >= 0 && x<width)
  2011. {
  2012. #ifdef MORE_TIMEING
  2013. T0= rdtsc();
  2014. #endif
  2015. if(mode & H_DEBLOCK)
  2016. {
  2017. if(mode & H_X1_FILTER)
  2018. horizX1Filter(dstBlock-4, stride, QP);
  2019. else
  2020. {
  2021. if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
  2022. {
  2023. if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
  2024. doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
  2025. }
  2026. else
  2027. doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
  2028. }
  2029. }
  2030. #ifdef MORE_TIMEING
  2031. T1= rdtsc();
  2032. horizTime+= T1-T0;
  2033. T0=T1;
  2034. #endif
  2035. dering(dstBlock - 9 - stride, stride, QP);
  2036. }
  2037. else if(y!=0)
  2038. dering(dstBlock - stride*9 + width-9, stride, QP);
  2039. //FIXME dering filter will not be applied to last block (bottom right)
  2040. dstBlock+=8;
  2041. srcBlock+=8;
  2042. vertBlock+=8;
  2043. vertSrcBlock+=8;
  2044. }
  2045. }
  2046. #ifdef HAVE_3DNOW
  2047. asm volatile("femms");
  2048. #elif defined (HAVE_MMX)
  2049. asm volatile("emms");
  2050. #endif
  2051. #ifdef TIMEING
  2052. // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
  2053. sumTime= rdtsc() - sumTime;
  2054. if(!isColor)
  2055. printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
  2056. (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
  2057. (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
  2058. , black, white);
  2059. #endif
  2060. }