You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2740 lines
78KB

  1. /*
  2. Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
  3. This program is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program; if not, write to the Free Software
  13. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  14. */
  15. /*
  16. C MMX MMX2 3DNow
  17. isVertDC Ec Ec
  18. isVertMinMaxOk Ec Ec
  19. doVertLowPass E e e
  20. doVertDefFilter Ec Ec Ec
  21. isHorizDC Ec Ec
  22. isHorizMinMaxOk a
  23. doHorizLowPass E a a
  24. doHorizDefFilter E ac ac
  25. deRing
  26. Vertical RKAlgo1 E a a
  27. Vertical X1 a E E
  28. Horizontal X1 a E E
  29. LinIpolDeinterlace a E E*
  30. LinBlendDeinterlace a E E*
  31. MedianDeinterlace a E
  32. * i dont have a 3dnow CPU -> its untested
  33. E = Exact implementation
  34. e = almost exact implementation
  35. a = alternative / approximate impl
  36. c = checked against the other implementations (-vo md5)
  37. */
  38. /*
  39. TODO:
  40. verify that everything works as it should (how?)
  41. reduce the time wasted on the mem transfer
  42. implement dering
  43. implement everything in C at least (done at the moment but ...)
  44. unroll stuff if instructions depend too much on the prior one
  45. we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
  46. move YScale thing to the end instead of fixing QP
  47. write a faster and higher quality deblocking filter :)
  48. do something about the speed of the horizontal filters
  49. make the mainloop more flexible (variable number of blocks at once
  50. (the if/else stuff per block is slowing things down)
  51. compare the quality & speed of all filters
  52. implement a few simple deinterlacing filters
  53. split this huge file
  54. fix warnings (unused vars, ...)
  55. ...
  56. Notes:
  57. */
  58. /*
  59. Changelog: use the CVS log
  60. rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
  61. added deinterlace filters (linear interpolate, linear blend, median)
  62. minor cleanups (removed some outcommented stuff)
  63. 0.1.3
  64. bugfixes: last 3 lines not brightness/contrast corrected
  65. brightness statistics messed up with initial black pic
  66. changed initial values of the brightness statistics
  67. C++ -> C conversion
  68. QP range question solved (very likely 1<=QP<=32 according to arpi)
  69. new experimental vertical deblocking filter
  70. RK filter has 3dNow support now (untested)
  71. 0.1.2
  72. fixed a bug in the horizontal default filter
  73. 3dnow version of the Horizontal & Vertical Lowpass filters
  74. mmx version of the Horizontal Default filter
  75. mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar
  76. added mode flags & quality2mode function
  77. 0.1.1
  78. */
  79. #include <inttypes.h>
  80. #include <stdio.h>
  81. #include <stdlib.h>
  82. #include "../config.h"
  83. //#undef HAVE_MMX2
  84. //#define HAVE_3DNOW
  85. //#undef HAVE_MMX
  86. #include "postprocess.h"
  87. #define MIN(a,b) ((a) > (b) ? (b) : (a))
  88. #define MAX(a,b) ((a) < (b) ? (b) : (a))
  89. #define ABS(a) ((a) > 0 ? (a) : (-(a)))
  90. #define SIGN(a) ((a) > 0 ? 1 : -1)
  91. #ifdef HAVE_MMX2
  92. #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
  93. #elif defined (HAVE_3DNOW)
  94. #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
  95. #endif
  96. static uint64_t packedYOffset= 0x0000000000000000LL;
  97. static uint64_t packedYScale= 0x0100010001000100LL;
  98. static uint64_t w05= 0x0005000500050005LL;
  99. static uint64_t w20= 0x0020002000200020LL;
  100. static uint64_t w1400= 0x1400140014001400LL;
  101. static uint64_t bm00000001= 0x00000000000000FFLL;
  102. static uint64_t bm00010000= 0x000000FF00000000LL;
  103. static uint64_t bm00001000= 0x00000000FF000000LL;
  104. static uint64_t bm10000000= 0xFF00000000000000LL;
  105. static uint64_t bm10000001= 0xFF000000000000FFLL;
  106. static uint64_t bm11000011= 0xFFFF00000000FFFFLL;
  107. static uint64_t bm00000011= 0x000000000000FFFFLL;
  108. static uint64_t bm11111110= 0xFFFFFFFFFFFFFF00LL;
  109. static uint64_t bm11000000= 0xFFFF000000000000LL;
  110. static uint64_t bm00011000= 0x000000FFFF000000LL;
  111. static uint64_t bm00110011= 0x0000FFFF0000FFFFLL;
  112. static uint64_t bm11001100= 0xFFFF0000FFFF0000LL;
  113. static uint64_t b00= 0x0000000000000000LL;
  114. static uint64_t b01= 0x0101010101010101LL;
  115. static uint64_t b02= 0x0202020202020202LL;
  116. static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL;
  117. static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL;
  118. static uint64_t b20= 0x2020202020202020LL;
  119. static uint64_t b80= 0x8080808080808080LL;
  120. static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL;
  121. static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL;
  122. static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL;
  123. static uint64_t temp0=0;
  124. static uint64_t temp1=0;
  125. static uint64_t temp2=0;
  126. static uint64_t temp3=0;
  127. static uint64_t temp4=0;
  128. static uint64_t temp5=0;
  129. static uint64_t pQPb=0;
  130. static uint8_t tempBlock[16*16];
  131. int hFlatnessThreshold= 56 - 16;
  132. int vFlatnessThreshold= 56 - 16;
  133. //amount of "black" u r willing to loose to get a brightness corrected picture
  134. double maxClippedThreshold= 0.01;
  135. int maxAllowedY=255;
  136. //FIXME can never make a movie´s black brighter (anyone needs that?)
  137. int minAllowedY=0;
  138. #ifdef TIMEING
  139. static inline long long rdtsc()
  140. {
  141. long long l;
  142. asm volatile( "rdtsc\n\t"
  143. : "=A" (l)
  144. );
  145. // printf("%d\n", int(l/1000));
  146. return l;
  147. }
  148. #endif
  149. #ifdef HAVE_MMX2
  150. static inline void prefetchnta(void *p)
  151. {
  152. asm volatile( "prefetchnta (%0)\n\t"
  153. : : "r" (p)
  154. );
  155. }
  156. static inline void prefetcht0(void *p)
  157. {
  158. asm volatile( "prefetcht0 (%0)\n\t"
  159. : : "r" (p)
  160. );
  161. }
  162. static inline void prefetcht1(void *p)
  163. {
  164. asm volatile( "prefetcht1 (%0)\n\t"
  165. : : "r" (p)
  166. );
  167. }
  168. static inline void prefetcht2(void *p)
  169. {
  170. asm volatile( "prefetcht2 (%0)\n\t"
  171. : : "r" (p)
  172. );
  173. }
  174. #endif
  175. //FIXME? |255-0| = 1 (shouldnt be a problem ...)
  176. /**
  177. * Check if the middle 8x8 Block in the given 8x10 block is flat
  178. */
  179. static inline int isVertDC(uint8_t src[], int stride){
  180. int numEq= 0;
  181. int y;
  182. src+= stride; // src points to begin of the 8x8 Block
  183. #ifdef HAVE_MMX
  184. asm volatile(
  185. "pushl %1\n\t"
  186. "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
  187. "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
  188. "movq (%1), %%mm0 \n\t"
  189. "addl %2, %1 \n\t"
  190. "movq (%1), %%mm1 \n\t"
  191. "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
  192. "paddb %%mm7, %%mm0 \n\t"
  193. "pcmpgtb %%mm6, %%mm0 \n\t"
  194. "addl %2, %1 \n\t"
  195. "movq (%1), %%mm2 \n\t"
  196. "psubb %%mm2, %%mm1 \n\t"
  197. "paddb %%mm7, %%mm1 \n\t"
  198. "pcmpgtb %%mm6, %%mm1 \n\t"
  199. "paddb %%mm1, %%mm0 \n\t"
  200. "addl %2, %1 \n\t"
  201. "movq (%1), %%mm1 \n\t"
  202. "psubb %%mm1, %%mm2 \n\t"
  203. "paddb %%mm7, %%mm2 \n\t"
  204. "pcmpgtb %%mm6, %%mm2 \n\t"
  205. "paddb %%mm2, %%mm0 \n\t"
  206. "addl %2, %1 \n\t"
  207. "movq (%1), %%mm2 \n\t"
  208. "psubb %%mm2, %%mm1 \n\t"
  209. "paddb %%mm7, %%mm1 \n\t"
  210. "pcmpgtb %%mm6, %%mm1 \n\t"
  211. "paddb %%mm1, %%mm0 \n\t"
  212. "addl %2, %1 \n\t"
  213. "movq (%1), %%mm1 \n\t"
  214. "psubb %%mm1, %%mm2 \n\t"
  215. "paddb %%mm7, %%mm2 \n\t"
  216. "pcmpgtb %%mm6, %%mm2 \n\t"
  217. "paddb %%mm2, %%mm0 \n\t"
  218. "addl %2, %1 \n\t"
  219. "movq (%1), %%mm2 \n\t"
  220. "psubb %%mm2, %%mm1 \n\t"
  221. "paddb %%mm7, %%mm1 \n\t"
  222. "pcmpgtb %%mm6, %%mm1 \n\t"
  223. "paddb %%mm1, %%mm0 \n\t"
  224. "addl %2, %1 \n\t"
  225. "movq (%1), %%mm1 \n\t"
  226. "psubb %%mm1, %%mm2 \n\t"
  227. "paddb %%mm7, %%mm2 \n\t"
  228. "pcmpgtb %%mm6, %%mm2 \n\t"
  229. "paddb %%mm2, %%mm0 \n\t"
  230. " \n\t"
  231. "movq %%mm0, %%mm1 \n\t"
  232. "psrlw $8, %%mm0 \n\t"
  233. "paddb %%mm1, %%mm0 \n\t"
  234. "movq %%mm0, %%mm1 \n\t"
  235. "psrlq $16, %%mm0 \n\t"
  236. "paddb %%mm1, %%mm0 \n\t"
  237. "movq %%mm0, %%mm1 \n\t"
  238. "psrlq $32, %%mm0 \n\t"
  239. "paddb %%mm1, %%mm0 \n\t"
  240. "popl %1\n\t"
  241. "movd %%mm0, %0 \n\t"
  242. : "=r" (numEq)
  243. : "r" (src), "r" (stride)
  244. );
  245. // printf("%d\n", numEq);
  246. numEq= (256 - (numEq & 0xFF)) &0xFF;
  247. // int asmEq= numEq;
  248. // numEq=0;
  249. // uint8_t *temp= src;
  250. #else
  251. for(y=0; y<BLOCK_SIZE-1; y++)
  252. {
  253. if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
  254. if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
  255. if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
  256. if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
  257. if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
  258. if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
  259. if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
  260. if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
  261. src+= stride;
  262. }
  263. #endif
  264. /* if(abs(numEq - asmEq) > 0)
  265. {
  266. printf("\nasm:%d c:%d\n", asmEq, numEq);
  267. for(int y=0; y<8; y++)
  268. {
  269. for(int x=0; x<8; x++)
  270. {
  271. printf("%d ", temp[x + y*stride]);
  272. }
  273. printf("\n");
  274. }
  275. }
  276. */
  277. // for(int i=0; i<numEq/8; i++) src[i]=255;
  278. return (numEq > vFlatnessThreshold) ? 1 : 0;
  279. }
  280. static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
  281. {
  282. #ifdef HAVE_MMX
  283. int isOk;
  284. asm volatile(
  285. // "int $3 \n\t"
  286. "movq (%1, %2), %%mm0 \n\t"
  287. "movq (%1, %2, 8), %%mm1 \n\t"
  288. "movq %%mm0, %%mm2 \n\t"
  289. "psubusb %%mm1, %%mm0 \n\t"
  290. "psubusb %%mm2, %%mm1 \n\t"
  291. "por %%mm1, %%mm0 \n\t" // ABS Diff
  292. "movq pQPb, %%mm7 \n\t" // QP,..., QP
  293. "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
  294. "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
  295. "pcmpeqd b00, %%mm0 \n\t"
  296. "psrlq $16, %%mm0 \n\t"
  297. "pcmpeqd bFF, %%mm0 \n\t"
  298. // "movd %%mm0, (%1, %2, 4)\n\t"
  299. "movd %%mm0, %0 \n\t"
  300. : "=r" (isOk)
  301. : "r" (src), "r" (stride)
  302. );
  303. return isOk ? 1 : 0;
  304. #else
  305. int isOk2= 1;
  306. int x;
  307. for(x=0; x<BLOCK_SIZE; x++)
  308. {
  309. if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
  310. }
  311. /* if(isOk && !isOk2 || !isOk && isOk2)
  312. {
  313. printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
  314. for(int y=0; y<9; y++)
  315. {
  316. for(int x=0; x<8; x++)
  317. {
  318. printf("%d ", src[x + y*stride]);
  319. }
  320. printf("\n");
  321. }
  322. } */
  323. return isOk2;
  324. #endif
  325. }
  326. /**
  327. * Do a vertical low pass filter on the 8x10 block (only write to the 8x8 block in the middle)
  328. * useing the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
  329. */
  330. static inline void doVertLowPass(uint8_t *src, int stride, int QP)
  331. {
  332. // QP= 64;
  333. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  334. //#ifdef HAVE_MMX2
  335. asm volatile( //"movv %0 %1 %2\n\t"
  336. "pushl %0 \n\t"
  337. "movq pQPb, %%mm0 \n\t" // QP,..., QP
  338. // "movq bFF , %%mm0 \n\t" // QP,..., QP
  339. "movq (%0), %%mm6 \n\t"
  340. "movq (%0, %1), %%mm5 \n\t"
  341. "movq %%mm5, %%mm1 \n\t"
  342. "movq %%mm6, %%mm2 \n\t"
  343. "psubusb %%mm6, %%mm5 \n\t"
  344. "psubusb %%mm1, %%mm2 \n\t"
  345. "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
  346. "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
  347. "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
  348. "pand %%mm2, %%mm6 \n\t"
  349. "pandn %%mm1, %%mm2 \n\t"
  350. "por %%mm2, %%mm6 \n\t"// First Line to Filter
  351. "movq (%0, %1, 8), %%mm5 \n\t"
  352. "leal (%0, %1, 4), %%eax \n\t"
  353. "leal (%0, %1, 8), %%ebx \n\t"
  354. "subl %1, %%ebx \n\t"
  355. "addl %1, %0 \n\t" // %0 points to line 1 not 0
  356. "movq (%0, %1, 8), %%mm7 \n\t"
  357. "movq %%mm5, %%mm1 \n\t"
  358. "movq %%mm7, %%mm2 \n\t"
  359. "psubusb %%mm7, %%mm5 \n\t"
  360. "psubusb %%mm1, %%mm2 \n\t"
  361. "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
  362. "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
  363. "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
  364. "pand %%mm2, %%mm7 \n\t"
  365. "pandn %%mm1, %%mm2 \n\t"
  366. "por %%mm2, %%mm7 \n\t" // First Line to Filter
  367. // 1 2 3 4 5 6 7 8
  368. // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
  369. // 6 4 2 2 1 1
  370. // 6 4 4 2
  371. // 6 8 2
  372. /*
  373. "movq %%mm6, %%mm2 \n\t" //1
  374. "movq %%mm6, %%mm3 \n\t" //1
  375. "paddusb b02, %%mm3 \n\t"
  376. "psrlw $2, %%mm3 \n\t" //1 /4
  377. "pand b3F, %%mm3 \n\t"
  378. "psubb %%mm3, %%mm2 \n\t"
  379. "movq (%0, %1), %%mm0 \n\t" // 1
  380. "movq %%mm0, %%mm1 \n\t" // 1
  381. "paddusb b02, %%mm0 \n\t"
  382. "psrlw $2, %%mm0 \n\t" // 1 /4
  383. "pand b3F, %%mm0 \n\t"
  384. "paddusb %%mm2, %%mm0 \n\t" //3 1 /4
  385. */
  386. "movq (%0, %1), %%mm0 \n\t" // 1
  387. "movq %%mm0, %%mm1 \n\t" // 1
  388. PAVGB(%%mm6, %%mm0) //1 1 /2
  389. PAVGB(%%mm6, %%mm0) //3 1 /4
  390. "movq (%0, %1, 4), %%mm2 \n\t" // 1
  391. "movq %%mm2, %%mm5 \n\t" // 1
  392. PAVGB((%%eax), %%mm2) // 11 /2
  393. PAVGB((%0, %1, 2), %%mm2) // 211 /4
  394. "movq %%mm2, %%mm3 \n\t" // 211 /4
  395. "movq (%0), %%mm4 \n\t" // 1
  396. PAVGB(%%mm4, %%mm3) // 4 211 /8
  397. PAVGB(%%mm0, %%mm3) //642211 /16
  398. "movq %%mm3, (%0) \n\t" // X
  399. // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
  400. "movq %%mm1, %%mm0 \n\t" // 1
  401. PAVGB(%%mm6, %%mm0) //1 1 /2
  402. "movq %%mm4, %%mm3 \n\t" // 1
  403. PAVGB((%0,%1,2), %%mm3) // 1 1 /2
  404. PAVGB((%%eax,%1,2), %%mm5) // 11 /2
  405. PAVGB((%%eax), %%mm5) // 211 /4
  406. PAVGB(%%mm5, %%mm3) // 2 2211 /8
  407. PAVGB(%%mm0, %%mm3) //4242211 /16
  408. "movq %%mm3, (%0,%1) \n\t" // X
  409. // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
  410. PAVGB(%%mm4, %%mm6) //11 /2
  411. "movq (%%ebx), %%mm0 \n\t" // 1
  412. PAVGB((%%eax, %1, 2), %%mm0) // 11/2
  413. "movq %%mm0, %%mm3 \n\t" // 11/2
  414. PAVGB(%%mm1, %%mm0) // 2 11/4
  415. PAVGB(%%mm6, %%mm0) //222 11/8
  416. PAVGB(%%mm2, %%mm0) //22242211/16
  417. "movq (%0, %1, 2), %%mm2 \n\t" // 1
  418. "movq %%mm0, (%0, %1, 2) \n\t" // X
  419. // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
  420. "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
  421. PAVGB((%%ebx), %%mm0) // 11 /2
  422. PAVGB(%%mm0, %%mm6) //11 11 /4
  423. PAVGB(%%mm1, %%mm4) // 11 /2
  424. PAVGB(%%mm2, %%mm1) // 11 /2
  425. PAVGB(%%mm1, %%mm6) //1122 11 /8
  426. PAVGB(%%mm5, %%mm6) //112242211 /16
  427. "movq (%%eax), %%mm5 \n\t" // 1
  428. "movq %%mm6, (%%eax) \n\t" // X
  429. // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
  430. "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
  431. PAVGB(%%mm7, %%mm6) // 11 /2
  432. PAVGB(%%mm4, %%mm6) // 11 11 /4
  433. PAVGB(%%mm3, %%mm6) // 11 2211 /8
  434. PAVGB(%%mm5, %%mm2) // 11 /2
  435. "movq (%0, %1, 4), %%mm4 \n\t" // 1
  436. PAVGB(%%mm4, %%mm2) // 112 /4
  437. PAVGB(%%mm2, %%mm6) // 112242211 /16
  438. "movq %%mm6, (%0, %1, 4) \n\t" // X
  439. // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
  440. PAVGB(%%mm7, %%mm1) // 11 2 /4
  441. PAVGB(%%mm4, %%mm5) // 11 /2
  442. PAVGB(%%mm5, %%mm0) // 11 11 /4
  443. "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
  444. PAVGB(%%mm6, %%mm1) // 11 4 2 /8
  445. PAVGB(%%mm0, %%mm1) // 11224222 /16
  446. // "pxor %%mm1, %%mm1 \n\t"
  447. "movq %%mm1, (%%eax, %1, 2) \n\t" // X
  448. // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
  449. PAVGB((%%ebx), %%mm2) // 112 4 /8
  450. "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
  451. PAVGB(%%mm0, %%mm6) // 1 1 /2
  452. PAVGB(%%mm7, %%mm6) // 1 12 /4
  453. PAVGB(%%mm2, %%mm6) // 1122424 /4
  454. // "pxor %%mm6, %%mm6 \n\t"
  455. "movq %%mm6, (%%ebx) \n\t" // X
  456. // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
  457. PAVGB(%%mm7, %%mm5) // 11 2 /4
  458. PAVGB(%%mm7, %%mm5) // 11 6 /8
  459. PAVGB(%%mm3, %%mm0) // 112 /4
  460. PAVGB(%%mm0, %%mm5) // 112246 /16
  461. // "pxor %%mm5, %%mm5 \n\t"
  462. // "movq pQPb, %%mm5 \n\t"
  463. "movq %%mm5, (%%eax, %1, 4) \n\t" // X
  464. "popl %0\n\t"
  465. :
  466. : "r" (src), "r" (stride)
  467. : "%eax", "%ebx"
  468. );
  469. #else
  470. const int l1= stride;
  471. const int l2= stride + l1;
  472. const int l3= stride + l2;
  473. const int l4= stride + l3;
  474. const int l5= stride + l4;
  475. const int l6= stride + l5;
  476. const int l7= stride + l6;
  477. const int l8= stride + l7;
  478. const int l9= stride + l8;
  479. int x;
  480. for(x=0; x<BLOCK_SIZE; x++)
  481. {
  482. const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
  483. const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
  484. int sums[9];
  485. sums[0] = first + src[l1];
  486. sums[1] = src[l1] + src[l2];
  487. sums[2] = src[l2] + src[l3];
  488. sums[3] = src[l3] + src[l4];
  489. sums[4] = src[l4] + src[l5];
  490. sums[5] = src[l5] + src[l6];
  491. sums[6] = src[l6] + src[l7];
  492. sums[7] = src[l7] + src[l8];
  493. sums[8] = src[l8] + last;
  494. src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
  495. src[l2]= ((src[l2]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
  496. src[l3]= ((src[l3]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
  497. src[l4]= ((src[l4]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
  498. src[l5]= ((src[l5]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
  499. src[l6]= ((src[l6]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
  500. src[l7]= ((last + src[l7]<<2) + (src[l8] + sums[5]<<1) + sums[3] + 8)>>4;
  501. src[l8]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
  502. src++;
  503. }
  504. #endif
  505. }
  506. /**
  507. * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
  508. * values are correctly clipped (MMX2)
  509. * values are wraparound (C)
  510. * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
  511. 0 8 16 24
  512. x = 8
  513. x/2 = 4
  514. x/8 = 1
  515. 1 12 12 23
  516. */
  517. static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
  518. {
  519. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  520. // FIXME rounding
  521. asm volatile(
  522. "pxor %%mm7, %%mm7 \n\t" // 0
  523. "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
  524. "leal (%0, %1), %%eax \n\t"
  525. "leal (%%eax, %1, 4), %%ebx \n\t"
  526. // 0 1 2 3 4 5 6 7 8 9
  527. // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
  528. "movq pQPb, %%mm0 \n\t" // QP,..., QP
  529. "movq %%mm0, %%mm1 \n\t" // QP,..., QP
  530. "paddusb b02, %%mm0 \n\t"
  531. "psrlw $2, %%mm0 \n\t"
  532. "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
  533. "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
  534. "movq (%0, %1, 4), %%mm2 \n\t" // line 4
  535. "movq (%%ebx), %%mm3 \n\t" // line 5
  536. "movq %%mm2, %%mm4 \n\t" // line 4
  537. "pcmpeqb %%mm5, %%mm5 \n\t" // -1
  538. "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
  539. PAVGB(%%mm3, %%mm5)
  540. "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
  541. "psubusb %%mm3, %%mm4 \n\t"
  542. "psubusb %%mm2, %%mm3 \n\t"
  543. "por %%mm3, %%mm4 \n\t" // |l4 - l5|
  544. "psubusb %%mm0, %%mm4 \n\t"
  545. "pcmpeqb %%mm7, %%mm4 \n\t"
  546. "pand %%mm4, %%mm5 \n\t" // d/2
  547. // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
  548. "paddb %%mm5, %%mm2 \n\t"
  549. // "psubb %%mm6, %%mm2 \n\t"
  550. "movq %%mm2, (%0,%1, 4) \n\t"
  551. "movq (%%ebx), %%mm2 \n\t"
  552. // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
  553. "psubb %%mm5, %%mm2 \n\t"
  554. // "psubb %%mm6, %%mm2 \n\t"
  555. "movq %%mm2, (%%ebx) \n\t"
  556. "paddb %%mm6, %%mm5 \n\t"
  557. "psrlw $2, %%mm5 \n\t"
  558. "pand b3F, %%mm5 \n\t"
  559. "psubb b20, %%mm5 \n\t" // (l5-l4)/8
  560. "movq (%%eax, %1, 2), %%mm2 \n\t"
  561. "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
  562. "paddsb %%mm5, %%mm2 \n\t"
  563. "psubb %%mm6, %%mm2 \n\t"
  564. "movq %%mm2, (%%eax, %1, 2) \n\t"
  565. "movq (%%ebx, %1), %%mm2 \n\t"
  566. "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
  567. "psubsb %%mm5, %%mm2 \n\t"
  568. "psubb %%mm6, %%mm2 \n\t"
  569. "movq %%mm2, (%%ebx, %1) \n\t"
  570. :
  571. : "r" (src), "r" (stride)
  572. : "%eax", "%ebx"
  573. );
  574. #else
  575. const int l1= stride;
  576. const int l2= stride + l1;
  577. const int l3= stride + l2;
  578. const int l4= stride + l3;
  579. const int l5= stride + l4;
  580. const int l6= stride + l5;
  581. const int l7= stride + l6;
  582. const int l8= stride + l7;
  583. const int l9= stride + l8;
  584. int x;
  585. for(x=0; x<BLOCK_SIZE; x++)
  586. {
  587. if(ABS(src[l4]-src[l5]) < QP + QP/4)
  588. {
  589. int v = (src[l5] - src[l4]);
  590. src[l3] +=v/8;
  591. src[l4] +=v/2;
  592. src[l5] -=v/2;
  593. src[l6] -=v/8;
  594. }
  595. src++;
  596. }
  597. #endif
  598. }
  599. /**
  600. * Experimental Filter 1
  601. * will not damage linear gradients
  602. * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
  603. * can only smooth blocks at the expected locations (it cant smooth them if they did move)
  604. * MMX2 version does correct clipping C version doesnt
  605. */
  606. static inline void vertX1Filter(uint8_t *src, int stride, int QP)
  607. {
  608. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  609. asm volatile(
  610. "pxor %%mm7, %%mm7 \n\t" // 0
  611. // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
  612. "leal (%0, %1), %%eax \n\t"
  613. "leal (%%eax, %1, 4), %%ebx \n\t"
  614. // 0 1 2 3 4 5 6 7 8 9
  615. // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
  616. "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
  617. "movq (%0, %1, 4), %%mm1 \n\t" // line 4
  618. "movq %%mm1, %%mm2 \n\t" // line 4
  619. "psubusb %%mm0, %%mm1 \n\t"
  620. "psubusb %%mm2, %%mm0 \n\t"
  621. "por %%mm1, %%mm0 \n\t" // |l2 - l3|
  622. "movq (%%ebx), %%mm3 \n\t" // line 5
  623. "movq (%%ebx, %1), %%mm4 \n\t" // line 6
  624. "movq %%mm3, %%mm5 \n\t" // line 5
  625. "psubusb %%mm4, %%mm3 \n\t"
  626. "psubusb %%mm5, %%mm4 \n\t"
  627. "por %%mm4, %%mm3 \n\t" // |l5 - l6|
  628. PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
  629. "movq %%mm2, %%mm1 \n\t" // line 4
  630. "psubusb %%mm5, %%mm2 \n\t"
  631. "movq %%mm2, %%mm4 \n\t"
  632. "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
  633. "psubusb %%mm1, %%mm5 \n\t"
  634. "por %%mm5, %%mm4 \n\t" // |l4 - l5|
  635. "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
  636. "movq %%mm4, %%mm3 \n\t" // d
  637. "psubusb pQPb, %%mm4 \n\t"
  638. "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
  639. "psubusb b01, %%mm3 \n\t"
  640. "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
  641. PAVGB(%%mm7, %%mm3) // d/2
  642. "movq %%mm3, %%mm1 \n\t" // d/2
  643. PAVGB(%%mm7, %%mm3) // d/4
  644. PAVGB(%%mm1, %%mm3) // 3*d/8
  645. "movq (%0, %1, 4), %%mm0 \n\t" // line 4
  646. "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
  647. "psubusb %%mm3, %%mm0 \n\t"
  648. "pxor %%mm2, %%mm0 \n\t"
  649. "movq %%mm0, (%0, %1, 4) \n\t" // line 4
  650. "movq (%%ebx), %%mm0 \n\t" // line 5
  651. "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
  652. "paddusb %%mm3, %%mm0 \n\t"
  653. "pxor %%mm2, %%mm0 \n\t"
  654. "movq %%mm0, (%%ebx) \n\t" // line 5
  655. PAVGB(%%mm7, %%mm1) // d/4
  656. "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
  657. "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
  658. "psubusb %%mm1, %%mm0 \n\t"
  659. "pxor %%mm2, %%mm0 \n\t"
  660. "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
  661. "movq (%%ebx, %1), %%mm0 \n\t" // line 6
  662. "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
  663. "paddusb %%mm1, %%mm0 \n\t"
  664. "pxor %%mm2, %%mm0 \n\t"
  665. "movq %%mm0, (%%ebx, %1) \n\t" // line 6
  666. PAVGB(%%mm7, %%mm1) // d/8
  667. "movq (%%eax, %1), %%mm0 \n\t" // line 2
  668. "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
  669. "psubusb %%mm1, %%mm0 \n\t"
  670. "pxor %%mm2, %%mm0 \n\t"
  671. "movq %%mm0, (%%eax, %1) \n\t" // line 2
  672. "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7
  673. "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
  674. "paddusb %%mm1, %%mm0 \n\t"
  675. "pxor %%mm2, %%mm0 \n\t"
  676. "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7
  677. :
  678. : "r" (src), "r" (stride)
  679. : "%eax", "%ebx"
  680. );
  681. #else
  682. const int l1= stride;
  683. const int l2= stride + l1;
  684. const int l3= stride + l2;
  685. const int l4= stride + l3;
  686. const int l5= stride + l4;
  687. const int l6= stride + l5;
  688. const int l7= stride + l6;
  689. const int l8= stride + l7;
  690. const int l9= stride + l8;
  691. int x;
  692. for(x=0; x<BLOCK_SIZE; x++)
  693. {
  694. int a= src[l3] - src[l4];
  695. int b= src[l4] - src[l5];
  696. int c= src[l5] - src[l6];
  697. int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
  698. if(d < QP)
  699. {
  700. int v = d * SIGN(-b);
  701. src[l2] +=v/8;
  702. src[l3] +=v/4;
  703. src[l4] +=3*v/8;
  704. src[l5] -=3*v/8;
  705. src[l6] -=v/4;
  706. src[l7] -=v/8;
  707. }
  708. src++;
  709. }
  710. /*
  711. const int l1= stride;
  712. const int l2= stride + l1;
  713. const int l3= stride + l2;
  714. const int l4= stride + l3;
  715. const int l5= stride + l4;
  716. const int l6= stride + l5;
  717. const int l7= stride + l6;
  718. const int l8= stride + l7;
  719. const int l9= stride + l8;
  720. for(int x=0; x<BLOCK_SIZE; x++)
  721. {
  722. int v2= src[l2];
  723. int v3= src[l3];
  724. int v4= src[l4];
  725. int v5= src[l5];
  726. int v6= src[l6];
  727. int v7= src[l7];
  728. if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
  729. {
  730. src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
  731. src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
  732. src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
  733. src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
  734. }
  735. src++;
  736. }
  737. */
  738. #endif
  739. }
  740. /**
  741. * Experimental Filter 1 (Horizontal)
  742. * will not damage linear gradients
  743. * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
  744. * can only smooth blocks at the expected locations (it cant smooth them if they did move)
  745. * MMX2 version does correct clipping C version doesnt
  746. * not identical with the vertical one
  747. */
  748. static inline void horizX1Filter(uint8_t *src, int stride, int QP)
  749. {
  750. int y;
  751. static uint64_t *lut= NULL;
  752. if(lut==NULL)
  753. {
  754. int i;
  755. lut= (uint64_t*)memalign(8, 256*8);
  756. for(i=0; i<256; i++)
  757. {
  758. int v= i < 128 ? 2*i : 2*(i-256);
  759. /*
  760. //Simulate 112242211 9-Tap filter
  761. uint64_t a= (v/16) & 0xFF;
  762. uint64_t b= (v/8) & 0xFF;
  763. uint64_t c= (v/4) & 0xFF;
  764. uint64_t d= (3*v/8) & 0xFF;
  765. */
  766. //Simulate piecewise linear interpolation
  767. uint64_t a= (v/16) & 0xFF;
  768. uint64_t b= (v*3/16) & 0xFF;
  769. uint64_t c= (v*5/16) & 0xFF;
  770. uint64_t d= (7*v/16) & 0xFF;
  771. uint64_t A= (0x100 - a)&0xFF;
  772. uint64_t B= (0x100 - b)&0xFF;
  773. uint64_t C= (0x100 - c)&0xFF;
  774. uint64_t D= (0x100 - c)&0xFF;
  775. lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
  776. (D<<24) | (C<<16) | (B<<8) | (A);
  777. //lut[i] = (v<<32) | (v<<24);
  778. }
  779. }
  780. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  781. asm volatile(
  782. "pxor %%mm7, %%mm7 \n\t" // 0
  783. // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
  784. "leal (%0, %1), %%eax \n\t"
  785. "leal (%%eax, %1, 4), %%ebx \n\t"
  786. "movq b80, %%mm6 \n\t"
  787. "movd pQPb, %%mm5 \n\t" // QP
  788. "movq %%mm5, %%mm4 \n\t"
  789. "paddusb %%mm5, %%mm5 \n\t" // 2QP
  790. "paddusb %%mm5, %%mm4 \n\t" // 3QP
  791. "pxor %%mm5, %%mm5 \n\t" // 0
  792. "psubb %%mm4, %%mm5 \n\t" // -3QP
  793. "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP
  794. "psllq $24, %%mm5 \n\t"
  795. // 0 1 2 3 4 5 6 7 8 9
  796. // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
  797. #define HX1old(a) \
  798. "movd " #a ", %%mm0 \n\t"\
  799. "movd 4" #a ", %%mm1 \n\t"\
  800. "punpckldq %%mm1, %%mm0 \n\t"\
  801. "movq %%mm0, %%mm1 \n\t"\
  802. "movq %%mm0, %%mm2 \n\t"\
  803. "psrlq $8, %%mm1 \n\t"\
  804. "psubusb %%mm1, %%mm2 \n\t"\
  805. "psubusb %%mm0, %%mm1 \n\t"\
  806. "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
  807. "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
  808. "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
  809. PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
  810. "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
  811. "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
  812. "paddb %%mm5, %%mm1 \n\t"\
  813. "psubusb %%mm5, %%mm1 \n\t"\
  814. PAVGB(%%mm7, %%mm1)\
  815. "pxor %%mm2, %%mm1 \n\t"\
  816. "psubb %%mm2, %%mm1 \n\t"\
  817. "psrlq $24, %%mm1 \n\t"\
  818. "movd %%mm1, %%ecx \n\t"\
  819. "paddb %%mm6, %%mm0 \n\t"\
  820. "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\
  821. "paddb %%mm6, %%mm0 \n\t"\
  822. "movq %%mm0, " #a " \n\t"\
  823. /*
  824. HX1old((%0))
  825. HX1old((%%eax))
  826. HX1old((%%eax, %1))
  827. HX1old((%%eax, %1, 2))
  828. HX1old((%0, %1, 4))
  829. HX1old((%%ebx))
  830. HX1old((%%ebx, %1))
  831. HX1old((%%ebx, %1, 2))
  832. */
  833. //FIXME add some comments, its unreadable ...
  834. #define HX1b(a, c, b, d) \
  835. "movd " #a ", %%mm0 \n\t"\
  836. "movd 4" #a ", %%mm1 \n\t"\
  837. "punpckldq %%mm1, %%mm0 \n\t"\
  838. "movd " #b ", %%mm4 \n\t"\
  839. "movq %%mm0, %%mm1 \n\t"\
  840. "movq %%mm0, %%mm2 \n\t"\
  841. "psrlq $8, %%mm1 \n\t"\
  842. "movd 4" #b ", %%mm3 \n\t"\
  843. "psubusb %%mm1, %%mm2 \n\t"\
  844. "psubusb %%mm0, %%mm1 \n\t"\
  845. "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
  846. "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
  847. "punpckldq %%mm3, %%mm4 \n\t"\
  848. "movq %%mm1, %%mm3 \n\t"\
  849. "psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
  850. PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
  851. "paddb %%mm6, %%mm0 \n\t"\
  852. "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
  853. "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
  854. "movq %%mm4, %%mm3 \n\t"\
  855. "paddb %%mm5, %%mm1 \n\t"\
  856. "psubusb %%mm5, %%mm1 \n\t"\
  857. "psrlq $8, %%mm3 \n\t"\
  858. PAVGB(%%mm7, %%mm1)\
  859. "pxor %%mm2, %%mm1 \n\t"\
  860. "psubb %%mm2, %%mm1 \n\t"\
  861. "movq %%mm4, %%mm2 \n\t"\
  862. "psrlq $24, %%mm1 \n\t"\
  863. "psubusb %%mm3, %%mm2 \n\t"\
  864. "movd %%mm1, %%ecx \n\t"\
  865. "psubusb %%mm4, %%mm3 \n\t"\
  866. "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
  867. "por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\
  868. "paddb %%mm6, %%mm0 \n\t"\
  869. "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
  870. "movq %%mm3, %%mm1 \n\t"\
  871. "psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\
  872. "movq %%mm0, " #a " \n\t"\
  873. PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
  874. "paddb %%mm6, %%mm4 \n\t"\
  875. "psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
  876. "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
  877. "paddb %%mm5, %%mm3 \n\t"\
  878. "psubusb %%mm5, %%mm3 \n\t"\
  879. PAVGB(%%mm7, %%mm3)\
  880. "pxor %%mm2, %%mm3 \n\t"\
  881. "psubb %%mm2, %%mm3 \n\t"\
  882. "psrlq $24, %%mm3 \n\t"\
  883. "movd " #c ", %%mm0 \n\t"\
  884. "movd 4" #c ", %%mm1 \n\t"\
  885. "punpckldq %%mm1, %%mm0 \n\t"\
  886. "paddb %%mm6, %%mm0 \n\t"\
  887. "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
  888. "paddb %%mm6, %%mm0 \n\t"\
  889. "movq %%mm0, " #c " \n\t"\
  890. "movd %%mm3, %%ecx \n\t"\
  891. "movd " #d ", %%mm0 \n\t"\
  892. "paddsb (%2, %%ecx, 8), %%mm4 \n\t"\
  893. "movd 4" #d ", %%mm1 \n\t"\
  894. "paddb %%mm6, %%mm4 \n\t"\
  895. "punpckldq %%mm1, %%mm0 \n\t"\
  896. "movq %%mm4, " #b " \n\t"\
  897. "paddb %%mm6, %%mm0 \n\t"\
  898. "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\
  899. "paddb %%mm6, %%mm0 \n\t"\
  900. "movq %%mm0, " #d " \n\t"\
  901. HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
  902. HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
  903. :
  904. : "r" (src), "r" (stride), "r" (lut)
  905. : "%eax", "%ebx", "%ecx"
  906. );
  907. #else
  908. //FIXME (has little in common with the mmx2 version)
  909. for(y=0; y<BLOCK_SIZE; y++)
  910. {
  911. int a= src[1] - src[2];
  912. int b= src[3] - src[4];
  913. int c= src[5] - src[6];
  914. int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
  915. if(d < QP)
  916. {
  917. int v = d * SIGN(-b);
  918. src[1] +=v/8;
  919. src[2] +=v/4;
  920. src[3] +=3*v/8;
  921. src[4] -=3*v/8;
  922. src[5] -=v/4;
  923. src[6] -=v/8;
  924. }
  925. src+=stride;
  926. }
  927. #endif
  928. }
/**
 * Default vertical deblocking filter.
 * Works on the 8 lines around a horizontal block boundary; computes the
 * "energy" (2,-5,5,-2 second-derivative measure) of the middle edge and of
 * the edges above/below it, derives a correction clipped against 8*QP and
 * against half the edge difference, then adds it to line 3 and subtracts it
 * from line 4 (the two lines adjacent to the block boundary).
 *
 * @param src    pointer one line ABOVE the 8-line window (MMX path does src+=stride)
 * @param stride distance in bytes between two lines
 * @param QP     quantization parameter controlling the filter strength
 *
 * NOTE(review): the MMX path spills intermediates to the file-level scratch
 * quadwords temp0..temp3 and reads the constant tables w05/w20 — not
 * reentrant/thread-safe because of that shared state.
 */
static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
{
#ifdef HAVE_MMX
	src+= stride;
	//FIXME try pmul for *5 stuff
//	src[0]=0;
	asm volatile(
		"pxor %%mm7, %%mm7 \n\t" // mm7 = 0, used as zero for unpacking and sign tricks
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7
//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ebx+%1	ebx+2%1
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1

		// Lx/Hx below = low/high 4 pixels of line x, widened to 16 bit words.
		"movq (%0), %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
		"punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0

		"movq (%%eax), %%mm2 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
		"punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1

		"movq (%%eax, %1), %%mm4 \n\t"
		"movq %%mm4, %%mm5 \n\t"
		"punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
		"punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2

		// build the edge energy 2L0 - 5L1 + 5L2 - 2L3 (and the H half)
		"paddw %%mm0, %%mm0 \n\t" // 2L0
		"paddw %%mm1, %%mm1 \n\t" // 2H0
		"psubw %%mm4, %%mm2 \n\t" // L1 - L2
		"psubw %%mm5, %%mm3 \n\t" // H1 - H2
		"psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
		"psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2

		"psllw $2, %%mm2 \n\t" // 4L1 - 4L2
		"psllw $2, %%mm3 \n\t" // 4H1 - 4H2
		"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
		"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2

		"movq (%%eax, %1, 2), %%mm2 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t" // L3
		"punpckhbw %%mm7, %%mm3 \n\t" // H3

		"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
		"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
		"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
		"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
		// spill the "upper edge" energy to scratch memory
		"movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
		"movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3

		"movq (%0, %1, 4), %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t" // L4
		"punpckhbw %%mm7, %%mm1 \n\t" // H4

		"psubw %%mm0, %%mm2 \n\t" // L3 - L4
		"psubw %%mm1, %%mm3 \n\t" // H3 - H4
		// spill the raw boundary difference, needed later for clipping
		"movq %%mm2, temp2 \n\t" // L3 - L4
		"movq %%mm3, temp3 \n\t" // H3 - H4

		// middle edge energy 2L2 - 5L3 + 5L4 - 2L5 (built in mm4/mm5)
		"paddw %%mm4, %%mm4 \n\t" // 2L2
		"paddw %%mm5, %%mm5 \n\t" // 2H2
		"psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
		"psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4

		"psllw $2, %%mm2 \n\t" // 4L3 - 4L4
		"psllw $2, %%mm3 \n\t" // 4H3 - 4H4
		"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
		"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4

//50 opcodes so far
		"movq (%%ebx), %%mm2 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t" // L5
		"punpckhbw %%mm7, %%mm3 \n\t" // H5
		"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
		"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
		"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
		"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5

		"movq (%%ebx, %1), %%mm6 \n\t"
		"punpcklbw %%mm7, %%mm6 \n\t" // L6
		"psubw %%mm6, %%mm2 \n\t" // L5 - L6
		"movq (%%ebx, %1), %%mm6 \n\t"
		"punpckhbw %%mm7, %%mm6 \n\t" // H6
		"psubw %%mm6, %%mm3 \n\t" // H5 - H6

		// lower edge energy 2L4 - 5L5 + 5L6 - 2L7
		"paddw %%mm0, %%mm0 \n\t" // 2L4
		"paddw %%mm1, %%mm1 \n\t" // 2H4
		"psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
		"psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6

		"psllw $2, %%mm2 \n\t" // 4L5 - 4L6
		"psllw $2, %%mm3 \n\t" // 4H5 - 4H6
		"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
		"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6

		"movq (%%ebx, %1, 2), %%mm2 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t" // L7
		"punpckhbw %%mm7, %%mm3 \n\t" // H7

		"paddw %%mm2, %%mm2 \n\t" // 2L7
		"paddw %%mm3, %%mm3 \n\t" // 2H7
		"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
		"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7

		"movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
		"movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3

//FIXME pxor, psubw, pmax for abs
		// absolute values via pcmpgtw/pxor/psubw (two's-complement abs)
		"movq %%mm7, %%mm6 \n\t" // 0
		"pcmpgtw %%mm0, %%mm6 \n\t"
		"pxor %%mm6, %%mm0 \n\t"
		"psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
		"movq %%mm7, %%mm6 \n\t" // 0
		"pcmpgtw %%mm1, %%mm6 \n\t"
		"pxor %%mm6, %%mm1 \n\t"
		"psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
		"movq %%mm7, %%mm6 \n\t" // 0
		"pcmpgtw %%mm2, %%mm6 \n\t"
		"pxor %%mm6, %%mm2 \n\t"
		"psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
		"movq %%mm7, %%mm6 \n\t" // 0
		"pcmpgtw %%mm3, %%mm6 \n\t"
		"pxor %%mm6, %%mm3 \n\t"
		"psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|

		// mm0/mm1 = min(upper-edge energy, lower-edge energy)
#ifdef HAVE_MMX2
		"pminsw %%mm2, %%mm0 \n\t"
		"pminsw %%mm3, %%mm1 \n\t"
#else
		// min(a,b) emulated as a - saturated(a-b)
		"movq %%mm0, %%mm6 \n\t"
		"psubusw %%mm2, %%mm6 \n\t"
		"psubw %%mm6, %%mm0 \n\t"
		"movq %%mm1, %%mm6 \n\t"
		"psubusw %%mm3, %%mm6 \n\t"
		"psubw %%mm6, %%mm1 \n\t"
#endif

		"movq %%mm7, %%mm6 \n\t" // 0
		"pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
		"pxor %%mm6, %%mm4 \n\t"
		"psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
		"pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
		"pxor %%mm7, %%mm5 \n\t"
		"psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
// 100 opcodes
		// broadcast QP to all 4 words, zero out energies >= 8QP
		"movd %2, %%mm2 \n\t" // QP
//"pcmpeqb %%mm2, %%mm2\n\t"
		"punpcklwd %%mm2, %%mm2 \n\t"
		"punpcklwd %%mm2, %%mm2 \n\t"
		"psllw $3, %%mm2 \n\t" // 8QP
		"movq %%mm2, %%mm3 \n\t" // 8QP
		"pcmpgtw %%mm4, %%mm2 \n\t"
		"pcmpgtw %%mm5, %%mm3 \n\t"
		"pand %%mm2, %%mm4 \n\t"
		"pand %%mm3, %%mm5 \n\t"

		// correction = max(|middle| - min(|upper|,|lower|), 0) * 5/64 (rounded)
		"psubusw %%mm0, %%mm4 \n\t" // hd
		"psubusw %%mm1, %%mm5 \n\t" // ld

		"movq w05, %%mm2 \n\t" // 5
		"pmullw %%mm2, %%mm4 \n\t"
		"pmullw %%mm2, %%mm5 \n\t"
		"movq w20, %%mm2 \n\t" // 32
		"paddw %%mm2, %%mm4 \n\t"
		"paddw %%mm2, %%mm5 \n\t"
		"psrlw $6, %%mm4 \n\t"
		"psrlw $6, %%mm5 \n\t"

/*
		"movq w06, %%mm2 \n\t" // 6
		"paddw %%mm2, %%mm4 \n\t"
		"paddw %%mm2, %%mm5 \n\t"
		"movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
//FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
		"pmulhw %%mm2, %%mm4 \n\t" // hd/13
		"pmulhw %%mm2, %%mm5 \n\t" // ld/13
*/

		// clip the correction to |L3-L4|/2 and force its sign to match (L3-L4)
		"movq temp2, %%mm0 \n\t" // L3 - L4
		"movq temp3, %%mm1 \n\t" // H3 - H4

		"pxor %%mm2, %%mm2 \n\t"
		"pxor %%mm3, %%mm3 \n\t"

		// FIXME rounding error
		"psraw $1, %%mm0 \n\t" // (L3 - L4)/2
		"psraw $1, %%mm1 \n\t" // (H3 - H4)/2
		"pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
		"pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
		"pxor %%mm2, %%mm0 \n\t"
		"pxor %%mm3, %%mm1 \n\t"
		"psubw %%mm2, %%mm0 \n\t" // |L3-L4|
		"psubw %%mm3, %%mm1 \n\t" // |H3-H4|
//		"psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
//		"psrlw $1, %%mm1 \n\t" // |H3 - H4|/2

		"pxor %%mm6, %%mm2 \n\t"
		"pxor %%mm7, %%mm3 \n\t"
		"pand %%mm2, %%mm4 \n\t" // zero if sign(middle) == sign(L3-L4)
		"pand %%mm3, %%mm5 \n\t"

#ifdef HAVE_MMX2
		"pminsw %%mm0, %%mm4 \n\t"
		"pminsw %%mm1, %%mm5 \n\t"
#else
		"movq %%mm4, %%mm2 \n\t"
		"psubusw %%mm0, %%mm2 \n\t"
		"psubw %%mm2, %%mm4 \n\t"
		"movq %%mm5, %%mm2 \n\t"
		"psubusw %%mm1, %%mm2 \n\t"
		"psubw %%mm2, %%mm5 \n\t"
#endif
		// restore the sign, pack back to bytes and apply +d / -d to lines 3/4
		"pxor %%mm6, %%mm4 \n\t"
		"pxor %%mm7, %%mm5 \n\t"
		"psubw %%mm6, %%mm4 \n\t"
		"psubw %%mm7, %%mm5 \n\t"
		"packsswb %%mm5, %%mm4 \n\t"
		"movq (%%eax, %1, 2), %%mm0 \n\t"
		"paddb %%mm4, %%mm0 \n\t"
		"movq %%mm0, (%%eax, %1, 2) \n\t"
		"movq (%0, %1, 4), %%mm0 \n\t"
		"psubb %%mm4, %%mm0 \n\t"
//		"pxor %%mm0, %%mm0 \n\t"
		"movq %%mm0, (%0, %1, 4) \n\t"
		:
		: "r" (src), "r" (stride), "r" (QP)
		: "%eax", "%ebx"
	);
#else
	// C reference implementation (same algorithm, one column at a time).
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
//	const int l9= stride + l8;
	int x;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		// 2,-5,5,-2 measure of the block-boundary discontinuity
		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
		if(ABS(middleEnergy) < 8*QP)
		{
			const int q=(src[l4] - src[l5])/2;
			const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
			const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);

			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
			d= MAX(d, 0);

			d= (5*d + 32) >> 6;
			d*= SIGN(-middleEnergy);

			// clip d to the interval [0, q] (or [q, 0] for negative q)
			if(q>0)
			{
				d= d<0 ? 0 : d;
				d= d>q ? q : d;
			}
			else
			{
				d= d>0 ? 0 : d;
				d= d<q ? q : d;
			}

			src[l4]-= d;
			src[l5]+= d;
		}
		src++;
	}
#endif
}
  1174. //FIXME? |255-0| = 1
  1175. /**
  1176. * Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock.
  1177. */
/**
 * Decide whether the 8x8 block is "flat" in the horizontal direction and,
 * as a side effect, copy its (possibly unaligned) pixels into the file-level
 * tempBlock buffer for the later horizontal filters.
 *
 * Counts neighbouring pixel pairs within each row whose difference is in
 * {-1,0,1} and compares the count against hFlatnessThreshold.
 *
 * @param src    top-left pixel of the 8x8 block
 * @param stride distance in bytes between two lines
 * @return nonzero iff the block is flat enough for the strong horizontal filter
 *
 * NOTE(review): not reentrant — writes the shared tempBlock.
 */
static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
{
//	src++;
	int numEq= 0;
#ifdef HAVE_MMX
	asm volatile (
//		"int $3 \n\t"
		"pushl %1\n\t"
		"movq b7E, %%mm7 \n\t" // mm7 = 0x7F
		"movq b7C, %%mm6 \n\t" // mm6 = 0x7D
		"leal tempBlock, %%eax \n\t"
		"pxor %%mm0, %%mm0 \n\t" // per-byte counter of "equal" pairs (negative counts)

// For each row: load 8 unaligned bytes via two dword loads, compare each
// byte with its right neighbour (|diff|<=1 via the 0x7F/0x7D bias trick),
// accumulate into mm0 and store the row into tempBlock at offset i.
#define HDC_CHECK_AND_CPY(i) \
		"movq -4(%1), %%mm2 \n\t"\
		"psrlq $32, %%mm2 \n\t"\
		"punpckldq 4(%1), %%mm2 \n\t" /* (%1) */\
		"movq %%mm2, %%mm1 \n\t"\
		"psrlq $8, %%mm2 \n\t"\
		"psubb %%mm1, %%mm2 \n\t"\
		"paddb %%mm7, %%mm2 \n\t"\
		"pcmpgtb %%mm6, %%mm2 \n\t"\
		"paddb %%mm2, %%mm0 \n\t"\
		"movq %%mm1," #i "(%%eax) \n\t"

		HDC_CHECK_AND_CPY(0)
		"addl %2, %1 \n\t"
		HDC_CHECK_AND_CPY(8)
		"addl %2, %1 \n\t"
		HDC_CHECK_AND_CPY(16)
		"addl %2, %1 \n\t"
		HDC_CHECK_AND_CPY(24)
		"addl %2, %1 \n\t"
		HDC_CHECK_AND_CPY(32)
		"addl %2, %1 \n\t"
		HDC_CHECK_AND_CPY(40)
		"addl %2, %1 \n\t"
		HDC_CHECK_AND_CPY(48)
		"addl %2, %1 \n\t"
		HDC_CHECK_AND_CPY(56)

		// horizontal sum of the 8 byte counters into the low byte
		"psllq $8, %%mm0 \n\t" // remove dummy value
		"movq %%mm0, %%mm1 \n\t"
		"psrlw $8, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlq $16, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlq $32, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"popl %1\n\t"
		"movd %%mm0, %0 \n\t"
		: "=r" (numEq)
		: "r" (src), "r" (stride)
		: "%eax"
	);
//	printf("%d\n", numEq);
	// pcmpgtb produced -1 per hit, so negate modulo 256
	numEq= (256 - (numEq & 0xFF)) &0xFF;
#else
	int y;
	for(y=0; y<BLOCK_SIZE; y++)
	{
		// (diff + 1) & 0xFFFF < 3  <=>  diff in {-1, 0, 1}
		if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
		if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
		tempBlock[0 + y*TEMP_STRIDE] = src[0];
		tempBlock[1 + y*TEMP_STRIDE] = src[1];
		tempBlock[2 + y*TEMP_STRIDE] = src[2];
		tempBlock[3 + y*TEMP_STRIDE] = src[3];
		tempBlock[4 + y*TEMP_STRIDE] = src[4];
		tempBlock[5 + y*TEMP_STRIDE] = src[5];
		tempBlock[6 + y*TEMP_STRIDE] = src[6];
		tempBlock[7 + y*TEMP_STRIDE] = src[7];
		src+= stride;
	}
#endif
/*	if(abs(numEq - asmEq) > 0)
	{
//		printf("\nasm:%d  c:%d\n", asmEq, numEq);
		for(int y=0; y<8; y++)
		{
			for(int x=0; x<8; x++)
			{
				printf("%d ", src[x + y*stride]);
			}
			printf("\n");
		}
	}
*/
//	printf("%d\n", numEq);
	return numEq > hFlatnessThreshold;
}
  1272. static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
  1273. {
  1274. #ifdef MMX_FIXME
  1275. FIXME
  1276. int isOk;
  1277. asm volatile(
  1278. // "int $3 \n\t"
  1279. "movq (%1, %2), %%mm0 \n\t"
  1280. "movq (%1, %2, 8), %%mm1 \n\t"
  1281. "movq %%mm0, %%mm2 \n\t"
  1282. "psubusb %%mm1, %%mm0 \n\t"
  1283. "psubusb %%mm2, %%mm1 \n\t"
  1284. "por %%mm1, %%mm0 \n\t" // ABS Diff
  1285. "movq pQPb, %%mm7 \n\t" // QP,..., QP
  1286. "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
  1287. "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
  1288. "pcmpeqd b00, %%mm0 \n\t"
  1289. "psrlq $16, %%mm0 \n\t"
  1290. "pcmpeqd bFF, %%mm0 \n\t"
  1291. // "movd %%mm0, (%1, %2, 4)\n\t"
  1292. "movd %%mm0, %0 \n\t"
  1293. : "=r" (isOk)
  1294. : "r" (src), "r" (stride)
  1295. );
  1296. return isOk;
  1297. #else
  1298. if(abs(src[0] - src[7]) > 2*QP) return 0;
  1299. return 1;
  1300. #endif
  1301. }
  1302. static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP)
  1303. {
  1304. #ifdef HAVE_MMX
  1305. asm volatile(
  1306. "pushl %0 \n\t"
  1307. "pxor %%mm7, %%mm7 \n\t"
  1308. "movq bm00001000, %%mm6 \n\t"
  1309. "movd %2, %%mm5 \n\t" // QP
  1310. "movq %%mm5, %%mm4 \n\t"
  1311. "paddusb %%mm5, %%mm5 \n\t" // 2QP
  1312. "paddusb %%mm5, %%mm4 \n\t" // 3QP
  1313. "psllq $24, %%mm4 \n\t"
  1314. "pxor %%mm5, %%mm5 \n\t" // 0
  1315. "psubb %%mm4, %%mm5 \n\t" // -QP
  1316. "leal tempBlock, %%eax \n\t"
  1317. //FIXME? "unroll by 2" and mix
  1318. #ifdef HAVE_MMX2
  1319. #define HDF(i) \
  1320. "movq " #i "(%%eax), %%mm0 \n\t"\
  1321. "movq %%mm0, %%mm1 \n\t"\
  1322. "movq %%mm0, %%mm2 \n\t"\
  1323. "psrlq $8, %%mm1 \n\t"\
  1324. "psubusb %%mm1, %%mm2 \n\t"\
  1325. "psubusb %%mm0, %%mm1 \n\t"\
  1326. "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
  1327. "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
  1328. "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
  1329. "pminub %%mm1, %%mm3 \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\
  1330. "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
  1331. "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
  1332. "paddb %%mm5, %%mm1 \n\t"\
  1333. "psubusb %%mm5, %%mm1 \n\t"\
  1334. "psrlw $2, %%mm1 \n\t"\
  1335. "pxor %%mm2, %%mm1 \n\t"\
  1336. "psubb %%mm2, %%mm1 \n\t"\
  1337. "pand %%mm6, %%mm1 \n\t"\
  1338. "psubb %%mm1, %%mm0 \n\t"\
  1339. "psllq $8, %%mm1 \n\t"\
  1340. "paddb %%mm1, %%mm0 \n\t"\
  1341. "movd %%mm0, (%0) \n\t"\
  1342. "psrlq $32, %%mm0 \n\t"\
  1343. "movd %%mm0, 4(%0) \n\t"
  1344. #else
  1345. #define HDF(i)\
  1346. "movq " #i "(%%eax), %%mm0 \n\t"\
  1347. "movq %%mm0, %%mm1 \n\t"\
  1348. "movq %%mm0, %%mm2 \n\t"\
  1349. "psrlq $8, %%mm1 \n\t"\
  1350. "psubusb %%mm1, %%mm2 \n\t"\
  1351. "psubusb %%mm0, %%mm1 \n\t"\
  1352. "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
  1353. "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
  1354. "movq %%mm1, %%mm3 \n\t"\
  1355. "psllq $32, %%mm3 \n\t"\
  1356. "movq %%mm3, %%mm4 \n\t"\
  1357. "psubusb %%mm1, %%mm4 \n\t"\
  1358. "psubb %%mm4, %%mm3 \n\t"\
  1359. "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
  1360. "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5,ü6|) */\
  1361. "paddb %%mm5, %%mm1 \n\t"\
  1362. "psubusb %%mm5, %%mm1 \n\t"\
  1363. "psrlw $2, %%mm1 \n\t"\
  1364. "pxor %%mm2, %%mm1 \n\t"\
  1365. "psubb %%mm2, %%mm1 \n\t"\
  1366. "pand %%mm6, %%mm1 \n\t"\
  1367. "psubb %%mm1, %%mm0 \n\t"\
  1368. "psllq $8, %%mm1 \n\t"\
  1369. "paddb %%mm1, %%mm0 \n\t"\
  1370. "movd %%mm0, (%0) \n\t"\
  1371. "psrlq $32, %%mm0 \n\t"\
  1372. "movd %%mm0, 4(%0) \n\t"
  1373. #endif
  1374. HDF(0)
  1375. "addl %1, %0 \n\t"
  1376. HDF(8)
  1377. "addl %1, %0 \n\t"
  1378. HDF(16)
  1379. "addl %1, %0 \n\t"
  1380. HDF(24)
  1381. "addl %1, %0 \n\t"
  1382. HDF(32)
  1383. "addl %1, %0 \n\t"
  1384. HDF(40)
  1385. "addl %1, %0 \n\t"
  1386. HDF(48)
  1387. "addl %1, %0 \n\t"
  1388. HDF(56)
  1389. "popl %0 \n\t"
  1390. :
  1391. : "r" (dst), "r" (stride), "r" (QP)
  1392. : "%eax"
  1393. );
  1394. #else
  1395. uint8_t *src= tempBlock;
  1396. int y;
  1397. for(y=0; y<BLOCK_SIZE; y++)
  1398. {
  1399. const int middleEnergy= 5*(src[4] - src[5]) + 2*(src[2] - src[5]);
  1400. dst[0] = src[0];
  1401. dst[1] = src[1];
  1402. dst[2] = src[2];
  1403. dst[3] = src[3];
  1404. dst[4] = src[4];
  1405. dst[5] = src[5];
  1406. dst[6] = src[6];
  1407. dst[7] = src[7];
  1408. if(ABS(middleEnergy) < 8*QP)
  1409. {
  1410. const int q=(src[3] - src[4])/2;
  1411. const int leftEnergy= 5*(src[2] - src[1]) + 2*(src[0] - src[3]);
  1412. const int rightEnergy= 5*(src[6] - src[5]) + 2*(src[4] - src[7]);
  1413. int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
  1414. d= MAX(d, 0);
  1415. d= (5*d + 32) >> 6;
  1416. d*= SIGN(-middleEnergy);
  1417. if(q>0)
  1418. {
  1419. d= d<0 ? 0 : d;
  1420. d= d>q ? q : d;
  1421. }
  1422. else
  1423. {
  1424. d= d>0 ? 0 : d;
  1425. d= d<q ? q : d;
  1426. }
  1427. dst[3]-= d;
  1428. dst[4]+= d;
  1429. }
  1430. dst+= stride;
  1431. src+= TEMP_STRIDE;
  1432. }
  1433. #endif
  1434. }
  1435. /**
  1436. * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
  1437. * useing the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
  1438. * useing the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
  1439. */
/**
 * Strong horizontal low-pass filter for flat blocks.
 * Reads the 8 rows staged in tempBlock, smooths each row with a symmetric
 * FIR (9-tap 1,1,2,2,4,2,2,1,1 in the C path; a shorter tap set built from
 * chained PAVGB averages in the MMX2/3DNOW path) and writes the result to dst.
 *
 * @param dst    destination block; the C path also reads dst[-1] and dst[8]
 *               as edge-clamped neighbours
 * @param stride distance in bytes between two lines of dst
 * @param QP     used only by the C path's edge clamping test
 *
 * NOTE(review): depends on tempBlock having been filled beforehand; not
 * reentrant because of that shared buffer.
 */
static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
{
//return;
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile( //"movv %0 %1 %2\n\t"
		"pushl %0\n\t"
		"pxor %%mm7, %%mm7 \n\t"
		"leal tempBlock, %%eax \n\t"

// Older experimental variants, kept for reference:
/*
#define HLP1 "movq (%0), %%mm0 \n\t"\
	"movq %%mm0, %%mm1 \n\t"\
	"psllq $8, %%mm0 \n\t"\
	PAVGB(%%mm1, %%mm0)\
	"psrlw $8, %%mm0 \n\t"\
	"pxor %%mm1, %%mm1 \n\t"\
	"packuswb %%mm1, %%mm0 \n\t"\
	"movq %%mm0, %%mm1 \n\t"\
	"movq %%mm0, %%mm2 \n\t"\
	"psllq $32, %%mm0 \n\t"\
	"paddb %%mm0, %%mm1 \n\t"\
	"psllq $16, %%mm2 \n\t"\
	PAVGB(%%mm2, %%mm0)\
	"movq %%mm0, %%mm3 \n\t"\
	"pand bm11001100, %%mm0 \n\t"\
	"paddusb %%mm0, %%mm3 \n\t"\
	"psrlq $8, %%mm3 \n\t"\
	PAVGB(%%mm1, %%mm4)\
	PAVGB(%%mm3, %%mm2)\
	"psrlq $16, %%mm2 \n\t"\
	"punpcklbw %%mm2, %%mm2 \n\t"\
	"movq %%mm2, (%0) \n\t"\

#define HLP2 "movq (%0), %%mm0 \n\t"\
	"movq %%mm0, %%mm1 \n\t"\
	"psllq $8, %%mm0 \n\t"\
	PAVGB(%%mm1, %%mm0)\
	"psrlw $8, %%mm0 \n\t"\
	"pxor %%mm1, %%mm1 \n\t"\
	"packuswb %%mm1, %%mm0 \n\t"\
	"movq %%mm0, %%mm2 \n\t"\
	"psllq $32, %%mm0 \n\t"\
	"psllq $16, %%mm2 \n\t"\
	PAVGB(%%mm2, %%mm0)\
	"movq %%mm0, %%mm3 \n\t"\
	"pand bm11001100, %%mm0 \n\t"\
	"paddusb %%mm0, %%mm3 \n\t"\
	"psrlq $8, %%mm3 \n\t"\
	PAVGB(%%mm3, %%mm2)\
	"psrlq $16, %%mm2 \n\t"\
	"punpcklbw %%mm2, %%mm2 \n\t"\
	"movq %%mm2, (%0) \n\t"\
*/
// approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
/*
Implemented	Exact 7-Tap
 9421		A321
 36421		64321
 334321		=
 1234321	=
  1234321	=
   123433	=
    12463	12346
     1249	 123A
*/

#ifdef HAVE_MMX2
#define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
	"movq %%mm0, %%mm1 \n\t"\
	"movq %%mm0, %%mm2 \n\t"\
	"movq %%mm0, %%mm3 \n\t"\
	"movq %%mm0, %%mm4 \n\t"\
	"psllq $8, %%mm1 \n\t"\
	"psrlq $8, %%mm2 \n\t"\
	"pand bm00000001, %%mm3 \n\t"\
	"pand bm10000000, %%mm4 \n\t"\
	"por %%mm3, %%mm1 \n\t"\
	"por %%mm4, %%mm2 \n\t"\
	PAVGB(%%mm2, %%mm1)\
	PAVGB(%%mm1, %%mm0)\
\
	"pshufw $0xF9, %%mm0, %%mm3 \n\t"\
	"pshufw $0x90, %%mm0, %%mm4 \n\t"\
	PAVGB(%%mm3, %%mm4)\
	PAVGB(%%mm4, %%mm0)\
	"movd %%mm0, (%0) \n\t"\
	"psrlq $32, %%mm0 \n\t"\
	"movd %%mm0, 4(%0) \n\t"
#else
#define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
	"movq %%mm0, %%mm1 \n\t"\
	"movq %%mm0, %%mm2 \n\t"\
	"movd -4(%0), %%mm3 \n\t" /*0001000*/\
	"movd 8(%0), %%mm4 \n\t" /*0001000*/\
	"psllq $8, %%mm1 \n\t"\
	"psrlq $8, %%mm2 \n\t"\
	"psrlq $24, %%mm3 \n\t"\
	"psllq $56, %%mm4 \n\t"\
	"por %%mm3, %%mm1 \n\t"\
	"por %%mm4, %%mm2 \n\t"\
	PAVGB(%%mm2, %%mm1)\
	PAVGB(%%mm1, %%mm0)\
\
	"movq %%mm0, %%mm3 \n\t"\
	"movq %%mm0, %%mm4 \n\t"\
	"movq %%mm0, %%mm5 \n\t"\
	"psrlq $16, %%mm3 \n\t"\
	"psllq $16, %%mm4 \n\t"\
	"pand bm11000000, %%mm5 \n\t"\
	"por %%mm5, %%mm3 \n\t"\
	"movq %%mm0, %%mm5 \n\t"\
	"pand bm00000011, %%mm5 \n\t"\
	"por %%mm5, %%mm4 \n\t"\
	PAVGB(%%mm3, %%mm4)\
	PAVGB(%%mm4, %%mm0)\
	"movd %%mm0, (%0) \n\t"\
	"psrlq $32, %%mm0 \n\t"\
	"movd %%mm0, 4(%0) \n\t"
#endif

/* uses the 7-Tap Filter: 1112111 */
#define NEW_HLP(i)\
	"movq " #i "(%%eax), %%mm0 \n\t"\
	"movq %%mm0, %%mm1 \n\t"\
	"movq %%mm0, %%mm2 \n\t"\
	"movd -4(%0), %%mm3 \n\t" /*0001000*/\
	"movd 8(%0), %%mm4 \n\t" /*0001000*/\
	"psllq $8, %%mm1 \n\t"\
	"psrlq $8, %%mm2 \n\t"\
	"psrlq $24, %%mm3 \n\t"\
	"psllq $56, %%mm4 \n\t"\
	"por %%mm3, %%mm1 \n\t"\
	"por %%mm4, %%mm2 \n\t"\
	"movq %%mm1, %%mm5 \n\t"\
	PAVGB(%%mm2, %%mm1)\
	PAVGB(%%mm1, %%mm0)\
	"psllq $8, %%mm5 \n\t"\
	"psrlq $8, %%mm2 \n\t"\
	"por %%mm3, %%mm5 \n\t"\
	"por %%mm4, %%mm2 \n\t"\
	"movq %%mm5, %%mm1 \n\t"\
	PAVGB(%%mm2, %%mm5)\
	"psllq $8, %%mm1 \n\t"\
	"psrlq $8, %%mm2 \n\t"\
	"por %%mm3, %%mm1 \n\t"\
	"por %%mm4, %%mm2 \n\t"\
	PAVGB(%%mm2, %%mm1)\
	PAVGB(%%mm1, %%mm5)\
	PAVGB(%%mm5, %%mm0)\
	"movd %%mm0, (%0) \n\t"\
	"psrlq $32, %%mm0 \n\t"\
	"movd %%mm0, 4(%0) \n\t"

/* uses the 9-Tap Filter: 112242211 */
#define NEW_HLP2(i)\
	"movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\
	"movq %%mm0, %%mm1 \n\t" /*0001000*/\
	"movq %%mm0, %%mm2 \n\t" /*0001000*/\
	"movd -4(%0), %%mm3 \n\t" /*0001000*/\
	"movd 8(%0), %%mm4 \n\t" /*0001000*/\
	"psllq $8, %%mm1 \n\t"\
	"psrlq $8, %%mm2 \n\t"\
	"psrlq $24, %%mm3 \n\t"\
	"psllq $56, %%mm4 \n\t"\
	"por %%mm3, %%mm1 \n\t" /*0010000*/\
	"por %%mm4, %%mm2 \n\t" /*0000100*/\
	"movq %%mm1, %%mm5 \n\t" /*0010000*/\
	PAVGB(%%mm2, %%mm1) /*0010100*/\
	PAVGB(%%mm1, %%mm0) /*0012100*/\
	"psllq $8, %%mm5 \n\t"\
	"psrlq $8, %%mm2 \n\t"\
	"por %%mm3, %%mm5 \n\t" /*0100000*/\
	"por %%mm4, %%mm2 \n\t" /*0000010*/\
	"movq %%mm5, %%mm1 \n\t" /*0100000*/\
	PAVGB(%%mm2, %%mm5) /*0100010*/\
	"psllq $8, %%mm1 \n\t"\
	"psrlq $8, %%mm2 \n\t"\
	"por %%mm3, %%mm1 \n\t" /*1000000*/\
	"por %%mm4, %%mm2 \n\t" /*0000001*/\
	"movq %%mm1, %%mm6 \n\t" /*1000000*/\
	PAVGB(%%mm2, %%mm1) /*1000001*/\
	"psllq $8, %%mm6 \n\t"\
	"psrlq $8, %%mm2 \n\t"\
	"por %%mm3, %%mm6 \n\t"/*100000000*/\
	"por %%mm4, %%mm2 \n\t"/*000000001*/\
	PAVGB(%%mm2, %%mm6) /*100000001*/\
	PAVGB(%%mm6, %%mm1) /*110000011*/\
	PAVGB(%%mm1, %%mm5) /*112000211*/\
	PAVGB(%%mm5, %%mm0) /*112242211*/\
	"movd %%mm0, (%0) \n\t"\
	"psrlq $32, %%mm0 \n\t"\
	"movd %%mm0, 4(%0) \n\t"

// the variant actually in use
#define HLP(i) NEW_HLP(i)

		HLP(0)
		"addl %1, %0 \n\t"
		HLP(8)
		"addl %1, %0 \n\t"
		HLP(16)
		"addl %1, %0 \n\t"
		HLP(24)
		"addl %1, %0 \n\t"
		HLP(32)
		"addl %1, %0 \n\t"
		HLP(40)
		"addl %1, %0 \n\t"
		HLP(48)
		"addl %1, %0 \n\t"
		HLP(56)

		"popl %0\n\t"
		:
		: "r" (dst), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	uint8_t *temp= tempBlock;
	int y;
	for(y=0; y<BLOCK_SIZE; y++)
	{
		// edge-clamped outer neighbours: use dst[-1]/dst[8] only when they
		// are close enough (< QP) to the block edge pixel
		const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
		const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];

		int sums[9]; // sums[i] = pair sum of neighbouring source pixels
		sums[0] = first + temp[0];
		sums[1] = temp[0] + temp[1];
		sums[2] = temp[1] + temp[2];
		sums[3] = temp[2] + temp[3];
		sums[4] = temp[3] + temp[4];
		sums[5] = temp[4] + temp[5];
		sums[6] = temp[5] + temp[6];
		sums[7] = temp[6] + temp[7];
		sums[8] = temp[7] + last;

		// NOTE: "a + b<<1" parses as "(a + b) << 1" (shift binds weaker than
		// +) — that is intentional here and required for the 1,1,2,2,4,2,2,1,1
		// tap weights to come out right. Result is the 9-tap sum /16, rounded.
		dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
		dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
		dst[2]= ((dst[2]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
		dst[3]= ((dst[3]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
		dst[4]= ((dst[4]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
		dst[5]= ((dst[5]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
		dst[6]= ((last + dst[6]<<2) + (dst[7] + sums[5]<<1) + sums[3] + 8)>>4;
		dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;

		dst+= stride;
		temp+= TEMP_STRIDE;
	}
#endif
}
/**
 * Deringing filter — UNIMPLEMENTED STUB.
 * The asm below is guarded by HAVE_MMX2X, which is never defined, and the C
 * branch is empty, so this function currently does nothing. The dead asm
 * only computes (max+min)/2 of the block's pixels; it also contains syntax
 * errors ("pcmpeq", a stray comma in FIND_MIN_MAX) and the last FIND_MIN_MAX
 * repeats (%%ebx, %1, 2) — presumably (%%ebx, %1, 4) / line 9 was intended;
 * TODO confirm before ever enabling it.
 */
static inline void dering(uint8_t src[], int stride, int QP)
{
//FIXME
#ifdef HAVE_MMX2X
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"pcmpeq %%mm6, %%mm6 \n\t" // running minimum (starts at 0xFF..)
		"pxor %%mm7, %%mm7 \n\t" // running maximum (starts at 0)

#define FIND_MIN_MAX(addr)\
		"movq (" #addr "), %%mm0, \n\t"\
		"pminub %%mm0, %%mm6 \n\t"\
		"pmaxub %%mm0, %%mm7 \n\t"

FIND_MIN_MAX(%0)
FIND_MIN_MAX(%%eax)
FIND_MIN_MAX(%%eax, %1)
FIND_MIN_MAX(%%eax, %1, 2)
FIND_MIN_MAX(%0, %1, 4)
FIND_MIN_MAX(%%ebx)
FIND_MIN_MAX(%%ebx, %1)
FIND_MIN_MAX(%%ebx, %1, 2)
FIND_MIN_MAX(%0, %1, 8)
FIND_MIN_MAX(%%ebx, %1, 2)

		// horizontal reduction of the 8 byte lanes to a single min ...
		"movq %%mm6, %%mm4 \n\t"
		"psrlq $32, %%mm6 \n\t"
		"pminub %%mm4, %%mm6 \n\t"
		"movq %%mm6, %%mm4 \n\t"
		"psrlq $16, %%mm6 \n\t"
		"pminub %%mm4, %%mm6 \n\t"
		"movq %%mm6, %%mm4 \n\t"
		"psrlq $8, %%mm6 \n\t"
		"pminub %%mm4, %%mm6 \n\t" // min of pixels

		// ... and a single max
		"movq %%mm7, %%mm4 \n\t"
		"psrlq $32, %%mm7 \n\t"
		"pmaxub %%mm4, %%mm7 \n\t"
		"movq %%mm7, %%mm4 \n\t"
		"psrlq $16, %%mm7 \n\t"
		"pmaxub %%mm4, %%mm7 \n\t"
		"movq %%mm7, %%mm4 \n\t"
		"psrlq $8, %%mm7 \n\t"
		"pmaxub %%mm4, %%mm7 \n\t" // max of pixels
		PAVGB(%%mm6, %%mm7) // (max + min)/2

		: : "r" (src), "r" (stride), "r" (QP)
		: "%eax", "%ebx"
	);
#else
//FIXME
#endif
}
  1729. /**
  1730. * Deinterlaces the given block
  1731. * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
  1732. */
/**
 * Linear-interpolation deinterlacer: replaces every odd line (1,3,5,7) of
 * the 8x8 block with the average of the even lines above and below it.
 * Called for every 8x8 block except the last row; may read line 8 of the
 * block below (src[stride*8]).
 *
 * @param src    top-left pixel of the 8x8 block
 * @param stride distance in bytes between two lines
 */
static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		// each PAVGB averages the two surrounding even lines into an odd line
		"movq (%0), %%mm0 \n\t"
		"movq (%%eax, %1), %%mm1 \n\t"
		PAVGB(%%mm1, %%mm0)\
		"movq %%mm0, (%%eax) \n\t" // line1 = (line0+line2)/2
		"movq (%0, %1, 4), %%mm0 \n\t"
		PAVGB(%%mm0, %%mm1)\
		"movq %%mm1, (%%eax, %1, 2) \n\t" // line3 = (line2+line4)/2
		"movq (%%ebx, %1), %%mm1 \n\t"
		PAVGB(%%mm1, %%mm0)\
		"movq %%mm0, (%%ebx) \n\t" // line5 = (line4+line6)/2
		"movq (%0, %1, 8), %%mm0 \n\t"
		PAVGB(%%mm0, %%mm1)\
		"movq %%mm1, (%%ebx, %1, 2) \n\t" // line7 = (line6+line8)/2
		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	int x;
	for(x=0; x<8; x++)
	{
		// note: >>1 truncates, while PAVGB rounds up — the two paths can
		// differ by 1 in the LSB
		src[stride] = (src[0] + src[stride*2])>>1;
		src[stride*3] = (src[stride*2] + src[stride*4])>>1;
		src[stride*5] = (src[stride*4] + src[stride*6])>>1;
		src[stride*7] = (src[stride*6] + src[stride*8])>>1;
		src++;
	}
#endif
}
  1769. /**
  1770. * Deinterlaces the given block
  1771. * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
  1772. */
/**
 * Linear-interpolation deinterlacer for the LAST block row: identical to
 * deInterlaceInterpolateLinear except that line 7 is duplicated from line 6
 * instead of averaging with line 8 (which would lie outside the image).
 *
 * @param src    top-left pixel of the 8x8 block
 * @param stride distance in bytes between two lines
 */
static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"movq (%0), %%mm0 \n\t"
		"movq (%%eax, %1), %%mm1 \n\t"
		PAVGB(%%mm1, %%mm0)\
		"movq %%mm0, (%%eax) \n\t" // line1 = (line0+line2)/2
		"movq (%0, %1, 4), %%mm0 \n\t"
		PAVGB(%%mm0, %%mm1)\
		"movq %%mm1, (%%eax, %1, 2) \n\t" // line3 = (line2+line4)/2
		"movq (%%ebx, %1), %%mm1 \n\t"
		PAVGB(%%mm1, %%mm0)\
		"movq %%mm0, (%%ebx) \n\t" // line5 = (line4+line6)/2
		"movq %%mm1, (%%ebx, %1, 2) \n\t" // line7 = line6 (no line8 available)
		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	int x;
	for(x=0; x<8; x++)
	{
		src[stride] = (src[0] + src[stride*2])>>1;
		src[stride*3] = (src[stride*2] + src[stride*4])>>1;
		src[stride*5] = (src[stride*4] + src[stride*6])>>1;
		src[stride*7] = src[stride*6]; // duplicate: bottom edge of the image
		src++;
	}
#endif
}
  1807. /**
  1808. * Deinterlaces the given block
  1809. * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
  1810. * will shift the image up by 1 line (FIXME if this is a problem)
  1811. */
  1812. static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
  1813. {
  1814. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  1815. asm volatile(
  1816. "leal (%0, %1), %%eax \n\t"
  1817. "leal (%%eax, %1, 4), %%ebx \n\t"
  1818. // 0 1 2 3 4 5 6 7 8 9
  1819. // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
  1820. "movq (%0), %%mm0 \n\t" // L0
  1821. "movq (%%eax, %1), %%mm1 \n\t" // L2
  1822. PAVGB(%%mm1, %%mm0) // L0+L2
  1823. "movq (%%eax), %%mm2 \n\t" // L1
  1824. PAVGB(%%mm2, %%mm0)
  1825. "movq %%mm0, (%0) \n\t"
  1826. "movq (%%eax, %1, 2), %%mm0 \n\t" // L3
  1827. PAVGB(%%mm0, %%mm2) // L1+L3
  1828. PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
  1829. "movq %%mm2, (%%eax) \n\t"
  1830. "movq (%0, %1, 4), %%mm2 \n\t" // L4
  1831. PAVGB(%%mm2, %%mm1) // L2+L4
  1832. PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
  1833. "movq %%mm1, (%%eax, %1) \n\t"
  1834. "movq (%%ebx), %%mm1 \n\t" // L5
  1835. PAVGB(%%mm1, %%mm0) // L3+L5
  1836. PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
  1837. "movq %%mm0, (%%eax, %1, 2) \n\t"
  1838. "movq (%%ebx, %1), %%mm0 \n\t" // L6
  1839. PAVGB(%%mm0, %%mm2) // L4+L6
  1840. PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
  1841. "movq %%mm2, (%0, %1, 4) \n\t"
  1842. "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
  1843. PAVGB(%%mm2, %%mm1) // L5+L7
  1844. PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
  1845. "movq %%mm1, (%%ebx) \n\t"
  1846. "movq (%0, %1, 8), %%mm1 \n\t" // L8
  1847. PAVGB(%%mm1, %%mm0) // L6+L8
  1848. PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
  1849. "movq %%mm0, (%%ebx, %1) \n\t"
  1850. "movq (%%ebx, %1, 4), %%mm0 \n\t" // L9
  1851. PAVGB(%%mm0, %%mm2) // L7+L9
  1852. PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
  1853. "movq %%mm2, (%%ebx, %1, 2) \n\t"
  1854. : : "r" (src), "r" (stride)
  1855. : "%eax", "%ebx"
  1856. );
  1857. #else
  1858. int x;
  1859. for(x=0; x<8; x++)
  1860. {
  1861. src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
  1862. src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
  1863. src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
  1864. src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
  1865. src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
  1866. src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
  1867. src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
  1868. src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
  1869. src++;
  1870. }
  1871. #endif
  1872. }
  1873. /**
  1874. * Deinterlaces the given block
  1875. * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
  1876. * will shift the image up by 1 line (FIXME if this is a problem)
  1877. */
  1878. static inline void deInterlaceBlendLinearLastRow(uint8_t src[], int stride)
  1879. {
  1880. #if defined (HAVE_MMSX2) || defined (HAVE_3DNOW)
  1881. asm volatile(
  1882. "leal (%0, %1), %%eax \n\t"
  1883. "leal (%%eax, %1, 4), %%ebx \n\t"
  1884. // 0 1 2 3 4 5 6 7 8 9
  1885. // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
  1886. "movq (%0), %%mm0 \n\t" // L0
  1887. "movq (%%eax, %1), %%mm1 \n\t" // L2
  1888. PAVGB(%%mm1, %%mm0) // L0+L2
  1889. "movq (%%eax), %%mm2 \n\t" // L1
  1890. PAVGB(%%mm2, %%mm0)
  1891. "movq %%mm0, (%0) \n\t"
  1892. "movq (%%eax, %1, 2), %%mm0 \n\t" // L3
  1893. PAVGB(%%mm0, %%mm2) // L1+L3
  1894. PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
  1895. "movq %%mm2, (%%eax) \n\t"
  1896. "movq (%0, %1, 4), %%mm2 \n\t" // L4
  1897. PAVGB(%%mm2, %%mm1) // L2+L4
  1898. PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
  1899. "movq %%mm1, (%%eax, %1) \n\t"
  1900. "movq (%%ebx), %%mm1 \n\t" // L5
  1901. PAVGB(%%mm1, %%mm0) // L3+L5
  1902. PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
  1903. "movq %%mm0, (%%eax, %1, 2) \n\t"
  1904. "movq (%%ebx, %1), %%mm0 \n\t" // L6
  1905. PAVGB(%%mm0, %%mm2) // L4+L6
  1906. PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
  1907. "movq %%mm2, (%0, %1, 4) \n\t"
  1908. "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
  1909. PAVGB(%%mm2, %%mm1) // L5+L7
  1910. PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
  1911. "movq %%mm1, (%%ebx) \n\t"
  1912. PAVGB(%%mm2, %%mm0) // L7 + L8
  1913. "movq %%mm0, (%%ebx, %1) \n\t"
  1914. "movq %%mm0, (%%ebx, %1, 2) \n\t"
  1915. : : "r" (src), "r" (stride)
  1916. : "%eax", "%ebx"
  1917. );
  1918. #else
  1919. int x;
  1920. for(x=0; x<8; x++)
  1921. {
  1922. src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
  1923. src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
  1924. src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
  1925. src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
  1926. src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
  1927. src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
  1928. src[stride*6] = (src[stride*6] + src[stride*7])>>1;
  1929. src[stride*7] = src[stride*6];
  1930. src++;
  1931. }
  1932. #endif
  1933. }
  1934. /**
  1935. * Deinterlaces the given block
  1936. * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
  1937. */
  1938. static inline void deInterlaceMedian(uint8_t src[], int stride)
  1939. {
  1940. #if defined (HAVE_MMX2)
  1941. asm volatile(
  1942. "leal (%0, %1), %%eax \n\t"
  1943. "leal (%%eax, %1, 4), %%ebx \n\t"
  1944. // 0 1 2 3 4 5 6 7 8 9
  1945. // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
  1946. "movq (%0), %%mm0 \n\t" //
  1947. "movq (%%eax, %1), %%mm2 \n\t" //
  1948. "movq (%%eax), %%mm1 \n\t" //
  1949. "movq %%mm0, %%mm3 \n\t"
  1950. "pmaxub %%mm1, %%mm0 \n\t" //
  1951. "pminub %%mm3, %%mm1 \n\t" //
  1952. "pmaxub %%mm2, %%mm1 \n\t" //
  1953. "pminub %%mm1, %%mm0 \n\t"
  1954. "movq %%mm0, (%%eax) \n\t"
  1955. "movq (%0, %1, 4), %%mm0 \n\t" //
  1956. "movq (%%eax, %1, 2), %%mm1 \n\t" //
  1957. "movq %%mm2, %%mm3 \n\t"
  1958. "pmaxub %%mm1, %%mm2 \n\t" //
  1959. "pminub %%mm3, %%mm1 \n\t" //
  1960. "pmaxub %%mm0, %%mm1 \n\t" //
  1961. "pminub %%mm1, %%mm2 \n\t"
  1962. "movq %%mm2, (%%eax, %1, 2) \n\t"
  1963. "movq (%%ebx), %%mm2 \n\t" //
  1964. "movq (%%ebx, %1), %%mm1 \n\t" //
  1965. "movq %%mm2, %%mm3 \n\t"
  1966. "pmaxub %%mm0, %%mm2 \n\t" //
  1967. "pminub %%mm3, %%mm0 \n\t" //
  1968. "pmaxub %%mm1, %%mm0 \n\t" //
  1969. "pminub %%mm0, %%mm2 \n\t"
  1970. "movq %%mm2, (%%ebx) \n\t"
  1971. "movq (%%ebx, %1, 2), %%mm2 \n\t" //
  1972. "movq (%0, %1, 8), %%mm0 \n\t" //
  1973. "movq %%mm2, %%mm3 \n\t"
  1974. "pmaxub %%mm0, %%mm2 \n\t" //
  1975. "pminub %%mm3, %%mm0 \n\t" //
  1976. "pmaxub %%mm1, %%mm0 \n\t" //
  1977. "pminub %%mm0, %%mm2 \n\t"
  1978. "movq %%mm2, (%%ebx, %1, 2) \n\t"
  1979. : : "r" (src), "r" (stride)
  1980. : "%eax", "%ebx"
  1981. );
  1982. #else
  1983. //FIXME
  1984. int x;
  1985. for(x=0; x<8; x++)
  1986. {
  1987. src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
  1988. src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
  1989. src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
  1990. src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
  1991. src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
  1992. src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
  1993. src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
  1994. src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
  1995. src++;
  1996. }
  1997. #endif
  1998. }
  1999. /**
  2000. * Deinterlaces the given block
  2001. * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
  2002. * will shift the image up by 1 line (FIXME if this is a problem)
  2003. */
  2004. static inline void deInterlaceMedianLastRow(uint8_t src[], int stride)
  2005. {
  2006. #if defined (HAVE_MMX2)
  2007. asm volatile(
  2008. "leal (%0, %1), %%eax \n\t"
  2009. "leal (%%eax, %1, 4), %%ebx \n\t"
  2010. // 0 1 2 3 4 5 6 7 8 9
  2011. // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
  2012. "movq (%0), %%mm0 \n\t" //
  2013. "movq (%%eax, %1), %%mm2 \n\t" //
  2014. "movq (%%eax), %%mm1 \n\t" //
  2015. "movq %%mm0, %%mm3 \n\t"
  2016. "pmaxub %%mm1, %%mm0 \n\t" //
  2017. "pminub %%mm3, %%mm1 \n\t" //
  2018. "pmaxub %%mm2, %%mm1 \n\t" //
  2019. "pminub %%mm1, %%mm0 \n\t"
  2020. "movq %%mm0, (%%eax) \n\t"
  2021. "movq (%0, %1, 4), %%mm0 \n\t" //
  2022. "movq (%%eax, %1, 2), %%mm1 \n\t" //
  2023. "movq %%mm2, %%mm3 \n\t"
  2024. "pmaxub %%mm1, %%mm2 \n\t" //
  2025. "pminub %%mm3, %%mm1 \n\t" //
  2026. "pmaxub %%mm0, %%mm1 \n\t" //
  2027. "pminub %%mm1, %%mm2 \n\t"
  2028. "movq %%mm2, (%%eax, %1, 2) \n\t"
  2029. "movq (%%ebx), %%mm2 \n\t" //
  2030. "movq (%%ebx, %1), %%mm1 \n\t" //
  2031. "movq %%mm2, %%mm3 \n\t"
  2032. "pmaxub %%mm0, %%mm2 \n\t" //
  2033. "pminub %%mm3, %%mm0 \n\t" //
  2034. "pmaxub %%mm1, %%mm0 \n\t" //
  2035. "pminub %%mm0, %%mm2 \n\t"
  2036. "movq %%mm2, (%%ebx) \n\t"
  2037. "movq %%mm1, (%%ebx, %1, 2) \n\t"
  2038. : : "r" (src), "r" (stride)
  2039. : "%eax", "%ebx"
  2040. );
  2041. #else
  2042. //FIXME
  2043. int x;
  2044. for(x=0; x<8; x++)
  2045. {
  2046. src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
  2047. src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
  2048. src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
  2049. src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
  2050. src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
  2051. src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
  2052. src[stride*6] = (src[stride*6] + src[stride*7])>>1;
  2053. src[stride*7] = src[stride*6];
  2054. src++;
  2055. }
  2056. #endif
  2057. }
#ifdef HAVE_ODIVX_POSTPROCESS
#include "../opendivx/postprocess.h"
// when non-zero, postprocess()/getPpModeForQuality() delegate entirely to the
// old OpenDivX postprocessor instead of the filters in this file
int use_old_pp=0;
#endif
// forward declaration: per-plane filter driver defined near the end of this file
static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
QP_STORE_T QPs[], int QPStride, int isColor, int mode);
  2064. /**
  2065. * ...
  2066. * the mode value is interpreted as a quality value if its negative, its range is then (-1 ... -63)
  2067. * -63 is best quality -1 is worst
  2068. */
  2069. void postprocess(unsigned char * src[], int src_stride,
  2070. unsigned char * dst[], int dst_stride,
  2071. int horizontal_size, int vertical_size,
  2072. QP_STORE_T *QP_store, int QP_stride,
  2073. int mode)
  2074. {
  2075. #ifdef HAVE_ODIVX_POSTPROCESS
  2076. // Note: I could make this shit outside of this file, but it would mean one
  2077. // more function call...
  2078. if(use_old_pp){
  2079. odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
  2080. return;
  2081. }
  2082. #endif
  2083. // I'm calling this from dec_video.c:video_set_postprocess()
  2084. // if(mode<0) mode= getModeForQuality(-mode);
  2085. /*
  2086. long long T= rdtsc();
  2087. for(int y=vertical_size-1; y>=0 ; y--)
  2088. memcpy(dst[0] + y*src_stride, src[0] + y*src_stride,src_stride);
  2089. // memcpy(dst[0], src[0],src_stride*vertical_size);
  2090. printf("%4dk\r", (rdtsc()-T)/1000);
  2091. return;
  2092. */
  2093. /*
  2094. long long T= rdtsc();
  2095. while( (rdtsc() - T)/1000 < 4000);
  2096. return;
  2097. */
  2098. postProcess(src[0], src_stride, dst[0], dst_stride,
  2099. horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
  2100. horizontal_size >>= 1;
  2101. vertical_size >>= 1;
  2102. src_stride >>= 1;
  2103. dst_stride >>= 1;
  2104. mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
  2105. if(1)
  2106. {
  2107. postProcess(src[1], src_stride, dst[1], dst_stride,
  2108. horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
  2109. postProcess(src[2], src_stride, dst[2], dst_stride,
  2110. horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
  2111. }
  2112. else
  2113. {
  2114. memcpy(dst[1], src[1], src_stride*horizontal_size);
  2115. memcpy(dst[2], src[2], src_stride*horizontal_size);
  2116. }
  2117. }
  2118. /**
  2119. * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
  2120. * 0 <= quality <= 6
  2121. */
  2122. int getPpModeForQuality(int quality){
  2123. int modes[1+GET_PP_QUALITY_MAX]= {
  2124. 0,
  2125. #if 1
  2126. // horizontal filters first
  2127. LUM_H_DEBLOCK,
  2128. LUM_H_DEBLOCK | LUM_V_DEBLOCK,
  2129. LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
  2130. LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
  2131. LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
  2132. LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
  2133. #else
  2134. // vertical filters first
  2135. LUM_V_DEBLOCK,
  2136. LUM_V_DEBLOCK | LUM_H_DEBLOCK,
  2137. LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
  2138. LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
  2139. LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
  2140. LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
  2141. #endif
  2142. };
  2143. #ifdef HAVE_ODIVX_POSTPROCESS
  2144. int odivx_modes[1+GET_PP_QUALITY_MAX]= {
  2145. 0,
  2146. PP_DEBLOCK_Y_H,
  2147. PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
  2148. PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
  2149. PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
  2150. PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
  2151. PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
  2152. };
  2153. if(use_old_pp) return odivx_modes[quality];
  2154. #endif
  2155. return modes[quality];
  2156. }
  2157. //} // extern "C"
  2158. /**
  2159. * Copies a block from src to dst and fixes the blacklevel
  2160. * numLines must be a multiple of 4
  2161. * levelFix == 0 -> dont touch the brighness & contrast
  2162. */
  2163. static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
  2164. int numLines, int levelFix)
  2165. {
  2166. int i;
  2167. if(levelFix)
  2168. {
  2169. #ifdef HAVE_MMX
  2170. asm volatile(
  2171. "movl %4, %%eax \n\t"
  2172. "movl %%eax, temp0\n\t"
  2173. "pushl %0 \n\t"
  2174. "pushl %1 \n\t"
  2175. "leal (%2,%2), %%eax \n\t"
  2176. "leal (%3,%3), %%ebx \n\t"
  2177. "movq packedYOffset, %%mm2 \n\t"
  2178. "movq packedYScale, %%mm3 \n\t"
  2179. "pxor %%mm4, %%mm4 \n\t"
  2180. #define SCALED_CPY \
  2181. "movq (%0), %%mm0 \n\t"\
  2182. "movq (%0,%2), %%mm1 \n\t"\
  2183. "psubusb %%mm2, %%mm0 \n\t"\
  2184. "psubusb %%mm2, %%mm1 \n\t"\
  2185. "movq %%mm0, %%mm5 \n\t"\
  2186. "punpcklbw %%mm4, %%mm0 \n\t"\
  2187. "punpckhbw %%mm4, %%mm5 \n\t"\
  2188. "psllw $7, %%mm0 \n\t"\
  2189. "psllw $7, %%mm5 \n\t"\
  2190. "pmulhw %%mm3, %%mm0 \n\t"\
  2191. "pmulhw %%mm3, %%mm5 \n\t"\
  2192. "packuswb %%mm5, %%mm0 \n\t"\
  2193. "movq %%mm0, (%1) \n\t"\
  2194. "movq %%mm1, %%mm5 \n\t"\
  2195. "punpcklbw %%mm4, %%mm1 \n\t"\
  2196. "punpckhbw %%mm4, %%mm5 \n\t"\
  2197. "psllw $7, %%mm1 \n\t"\
  2198. "psllw $7, %%mm5 \n\t"\
  2199. "pmulhw %%mm3, %%mm1 \n\t"\
  2200. "pmulhw %%mm3, %%mm5 \n\t"\
  2201. "packuswb %%mm5, %%mm1 \n\t"\
  2202. "movq %%mm1, (%1, %3) \n\t"\
  2203. "1: \n\t"
  2204. SCALED_CPY
  2205. "addl %%eax, %0 \n\t"
  2206. "addl %%ebx, %1 \n\t"
  2207. SCALED_CPY
  2208. "addl %%eax, %0 \n\t"
  2209. "addl %%ebx, %1 \n\t"
  2210. "decl temp0 \n\t"
  2211. "jnz 1b \n\t"
  2212. "popl %1 \n\t"
  2213. "popl %0 \n\t"
  2214. : : "r" (src),
  2215. "r" (dst),
  2216. "r" (srcStride),
  2217. "r" (dstStride),
  2218. "m" (numLines>>2)
  2219. : "%eax", "%ebx"
  2220. );
  2221. #else
  2222. for(i=0; i<numLines; i++)
  2223. memcpy( &(dst[dstStride*i]),
  2224. &(src[srcStride*i]), BLOCK_SIZE);
  2225. #endif
  2226. }
  2227. else
  2228. {
  2229. #ifdef HAVE_MMX
  2230. asm volatile(
  2231. "movl %4, %%eax \n\t"
  2232. "movl %%eax, temp0\n\t"
  2233. "pushl %0 \n\t"
  2234. "pushl %1 \n\t"
  2235. "leal (%2,%2), %%eax \n\t"
  2236. "leal (%3,%3), %%ebx \n\t"
  2237. "movq packedYOffset, %%mm2 \n\t"
  2238. "movq packedYScale, %%mm3 \n\t"
  2239. #define SIMPLE_CPY \
  2240. "movq (%0), %%mm0 \n\t"\
  2241. "movq (%0,%2), %%mm1 \n\t"\
  2242. "movq %%mm0, (%1) \n\t"\
  2243. "movq %%mm1, (%1, %3) \n\t"\
  2244. "1: \n\t"
  2245. SIMPLE_CPY
  2246. "addl %%eax, %0 \n\t"
  2247. "addl %%ebx, %1 \n\t"
  2248. SIMPLE_CPY
  2249. "addl %%eax, %0 \n\t"
  2250. "addl %%ebx, %1 \n\t"
  2251. "decl temp0 \n\t"
  2252. "jnz 1b \n\t"
  2253. "popl %1 \n\t"
  2254. "popl %0 \n\t"
  2255. : : "r" (src),
  2256. "r" (dst),
  2257. "r" (srcStride),
  2258. "r" (dstStride),
  2259. "m" (numLines>>2)
  2260. : "%eax", "%ebx"
  2261. );
  2262. #else
  2263. for(i=0; i<numLines; i++)
  2264. memcpy( &(dst[dstStride*i]),
  2265. &(src[srcStride*i]), BLOCK_SIZE);
  2266. #endif
  2267. }
  2268. }
  2269. /**
  2270. * Filters array of bytes (Y or U or V values)
  2271. */
  2272. static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
  2273. QP_STORE_T QPs[], int QPStride, int isColor, int mode)
  2274. {
  2275. int x,y;
  2276. /* we need 64bit here otherwise we´ll going to have a problem
  2277. after watching a black picture for 5 hours*/
  2278. static uint64_t *yHistogram= NULL;
  2279. int black=0, white=255; // blackest black and whitest white in the picture
  2280. #ifdef TIMEING
  2281. long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
  2282. sumTime= rdtsc();
  2283. #endif
  2284. if(!yHistogram)
  2285. {
  2286. int i;
  2287. yHistogram= (uint64_t*)malloc(8*256);
  2288. for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
  2289. }
  2290. if(!isColor)
  2291. {
  2292. uint64_t sum= 0;
  2293. int i;
  2294. static int framenum= -1;
  2295. uint64_t maxClipped;
  2296. uint64_t clipped;
  2297. double scale;
  2298. framenum++;
  2299. if(framenum == 1) yHistogram[0]= width*height/64*15/256;
  2300. for(i=0; i<256; i++)
  2301. {
  2302. sum+= yHistogram[i];
  2303. // printf("%d ", yHistogram[i]);
  2304. }
  2305. // printf("\n\n");
  2306. /* we allways get a completly black picture first */
  2307. maxClipped= (uint64_t)(sum * maxClippedThreshold);
  2308. clipped= sum;
  2309. for(black=255; black>0; black--)
  2310. {
  2311. if(clipped < maxClipped) break;
  2312. clipped-= yHistogram[black];
  2313. }
  2314. clipped= sum;
  2315. for(white=0; white<256; white++)
  2316. {
  2317. if(clipped < maxClipped) break;
  2318. clipped-= yHistogram[white];
  2319. }
  2320. // we cant handle negative correctures
  2321. packedYOffset= MAX(black - minAllowedY, 0);
  2322. packedYOffset|= packedYOffset<<32;
  2323. packedYOffset|= packedYOffset<<16;
  2324. packedYOffset|= packedYOffset<<8;
  2325. scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
  2326. packedYScale= (uint16_t)(scale*512.0 + 0.5);
  2327. packedYScale|= packedYScale<<32;
  2328. packedYScale|= packedYScale<<16;
  2329. }
  2330. else
  2331. {
  2332. packedYScale= 0x0100010001000100LL;
  2333. packedYOffset= 0;
  2334. }
  2335. for(x=0; x<width; x+=BLOCK_SIZE)
  2336. blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
  2337. for(y=0; y<height; y+=BLOCK_SIZE)
  2338. {
  2339. //1% speedup if these are here instead of the inner loop
  2340. uint8_t *srcBlock= &(src[y*srcStride]);
  2341. uint8_t *dstBlock= &(dst[y*dstStride]);
  2342. uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start
  2343. uint8_t *vertBlock= &(dstBlock[dstStride*3]);
  2344. // finish 1 block before the next otherwise we´ll might have a problem
  2345. // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
  2346. for(x=0; x<width; x+=BLOCK_SIZE)
  2347. {
  2348. const int stride= dstStride;
  2349. int QP= isColor ?
  2350. QPs[(y>>3)*QPStride + (x>>3)]:
  2351. QPs[(y>>4)*QPStride + (x>>4)];
  2352. if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
  2353. #ifdef HAVE_MMX
  2354. asm volatile(
  2355. "movd %0, %%mm7 \n\t"
  2356. "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
  2357. "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
  2358. "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
  2359. "movq %%mm7, pQPb \n\t"
  2360. : : "r" (QP)
  2361. );
  2362. #endif
  2363. if(y + 12 < height)
  2364. {
  2365. #ifdef MORE_TIMEING
  2366. T0= rdtsc();
  2367. #endif
  2368. #ifdef HAVE_MMX2
  2369. prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
  2370. prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
  2371. prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
  2372. prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
  2373. #elif defined(HAVE_3DNOW)
  2374. //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
  2375. /* prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
  2376. prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
  2377. prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
  2378. prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
  2379. */
  2380. #endif
  2381. if(!isColor) yHistogram[ srcBlock[0] ]++;
  2382. blockCopy(vertBlock + dstStride*2, dstStride,
  2383. vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
  2384. if(mode & LINEAR_IPOL_DEINT_FILTER)
  2385. deInterlaceInterpolateLinear(dstBlock, dstStride);
  2386. else if(mode & LINEAR_BLEND_DEINT_FILTER)
  2387. deInterlaceBlendLinear(dstBlock, dstStride);
  2388. else if(mode & MEDIAN_DEINT_FILTER)
  2389. deInterlaceMedian(dstBlock, dstStride);
  2390. /* else if(mode & CUBIC_IPOL_DEINT_FILTER)
  2391. deInterlaceInterpolateCubic(dstBlock, dstStride);
  2392. else if(mode & CUBIC_BLEND_DEINT_FILTER)
  2393. deInterlaceBlendCubic(dstBlock, dstStride);
  2394. */
  2395. #ifdef MORE_TIMEING
  2396. T1= rdtsc();
  2397. memcpyTime+= T1-T0;
  2398. T0=T1;
  2399. #endif
  2400. if(mode & V_DEBLOCK)
  2401. {
  2402. if(mode & V_RK1_FILTER)
  2403. vertRK1Filter(vertBlock, stride, QP);
  2404. else if(mode & V_X1_FILTER)
  2405. vertX1Filter(vertBlock, stride, QP);
  2406. else
  2407. {
  2408. if( isVertDC(vertBlock, stride))
  2409. {
  2410. if(isVertMinMaxOk(vertBlock, stride, QP))
  2411. doVertLowPass(vertBlock, stride, QP);
  2412. }
  2413. else
  2414. doVertDefFilter(vertBlock, stride, QP);
  2415. }
  2416. }
  2417. #ifdef MORE_TIMEING
  2418. T1= rdtsc();
  2419. vertTime+= T1-T0;
  2420. T0=T1;
  2421. #endif
  2422. }
  2423. else
  2424. {
  2425. blockCopy(vertBlock + dstStride*1, dstStride,
  2426. vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
  2427. if(mode & LINEAR_IPOL_DEINT_FILTER)
  2428. deInterlaceInterpolateLinearLastRow(dstBlock, dstStride);
  2429. else if(mode & LINEAR_BLEND_DEINT_FILTER)
  2430. deInterlaceBlendLinearLastRow(dstBlock, dstStride);
  2431. else if(mode & MEDIAN_DEINT_FILTER)
  2432. deInterlaceMedianLastRow(dstBlock, dstStride);
  2433. /* else if(mode & CUBIC_IPOL_DEINT_FILTER)
  2434. deInterlaceInterpolateCubicLastRow(dstBlock, dstStride);
  2435. else if(mode & CUBIC_BLEND_DEINT_FILTER)
  2436. deInterlaceBlendCubicLastRow(dstBlock, dstStride);
  2437. */
  2438. }
  2439. if(x - 8 >= 0 && x<width)
  2440. {
  2441. #ifdef MORE_TIMEING
  2442. T0= rdtsc();
  2443. #endif
  2444. if(mode & H_DEBLOCK)
  2445. {
  2446. if(mode & H_X1_FILTER)
  2447. horizX1Filter(dstBlock-4, stride, QP);
  2448. else
  2449. {
  2450. if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
  2451. {
  2452. if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
  2453. doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
  2454. }
  2455. else
  2456. doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
  2457. }
  2458. }
  2459. #ifdef MORE_TIMEING
  2460. T1= rdtsc();
  2461. horizTime+= T1-T0;
  2462. T0=T1;
  2463. #endif
  2464. dering(dstBlock - 9 - stride, stride, QP);
  2465. }
  2466. else if(y!=0)
  2467. dering(dstBlock - stride*9 + width-9, stride, QP);
  2468. //FIXME dering filter will not be applied to last block (bottom right)
  2469. dstBlock+=8;
  2470. srcBlock+=8;
  2471. vertBlock+=8;
  2472. vertSrcBlock+=8;
  2473. }
  2474. }
  2475. #ifdef HAVE_3DNOW
  2476. asm volatile("femms");
  2477. #elif defined (HAVE_MMX)
  2478. asm volatile("emms");
  2479. #endif
  2480. #ifdef TIMEING
  2481. // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
  2482. sumTime= rdtsc() - sumTime;
  2483. if(!isColor)
  2484. printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
  2485. (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
  2486. (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
  2487. , black, white);
  2488. #endif
  2489. }