You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1072 lines
26KB

  1. /*
  2. * MMX optimized DSP utils
  3. * Copyright (c) 2000, 2001 Gerard Lantau.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18. *
  19. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  20. */
  21. #include "../dsputil.h"
  22. int mm_flags; /* multimedia extension flags */
  23. int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
  24. int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h);
  25. int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
  26. int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
  27. int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
  28. #ifdef USE_MMX_IDCT
  29. /* external functions, defined in libmpeg2 */
  30. void mmx_idct(DCTELEM *block);
  31. void mmxext_idct(DCTELEM *block);
  32. /* this should be in dsputil.h? -- A'rpi */
  33. extern UINT8 ff_alternate_horizontal_scan[64];
  34. extern UINT8 ff_alternate_vertical_scan[64];
  35. extern UINT8 zigzag_direct[64];
  36. #endif
  37. /* pixel operations */
  /*
   * Rounding constants loaded into an MMX register by the pixel routines.
   * Each 64-bit value is four packed 16-bit words holding 1 (mm_wone) or
   * 2 (mm_wtwo).  The ULL suffix makes the literals 64-bit regardless of the
   * language dialect or the width of long.
   */
  static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
  static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
  42. /***********************************/
  43. /* 3Dnow specific */
  44. #define DEF(x) x ## _3dnow
  45. /* for Athlons PAVGUSB is prefered */
  46. #define PAVGB "pavgusb"
  47. #include "dsputil_mmx_avg.h"
  48. #undef DEF
  49. #undef PAVGB
  50. /***********************************/
  51. /* MMX2 specific */
  52. #define DEF(x) x ## _sse
  53. /* Introduced only in MMX2 set */
  54. #define PAVGB "pavgb"
  55. #include "dsputil_mmx_avg.h"
  56. #undef DEF
  57. #undef PAVGB
  58. /***********************************/
  59. /* standard MMX */
  60. static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
  61. {
  62. DCTELEM *p;
  63. const UINT8 *pix;
  64. int i;
  65. /* read the pixels */
  66. p = block;
  67. pix = pixels;
  68. __asm __volatile("pxor %%mm7, %%mm7":::"memory");
  69. for(i=0;i<4;i++) {
  70. __asm __volatile(
  71. "movq %1, %%mm0\n\t"
  72. "movq %2, %%mm1\n\t"
  73. "movq %%mm0, %%mm2\n\t"
  74. "movq %%mm1, %%mm3\n\t"
  75. "punpcklbw %%mm7, %%mm0\n\t"
  76. "punpckhbw %%mm7, %%mm2\n\t"
  77. "punpcklbw %%mm7, %%mm1\n\t"
  78. "punpckhbw %%mm7, %%mm3\n\t"
  79. "movq %%mm0, %0\n\t"
  80. "movq %%mm2, 8%0\n\t"
  81. "movq %%mm1, 16%0\n\t"
  82. "movq %%mm3, 24%0\n\t"
  83. :"=m"(*p)
  84. :"m"(*pix), "m"(*(pix+line_size))
  85. :"memory");
  86. pix += line_size*2;
  87. p += 16;
  88. }
  89. }
  90. static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
  91. {
  92. const DCTELEM *p;
  93. UINT8 *pix;
  94. int i;
  95. /* read the pixels */
  96. p = block;
  97. pix = pixels;
  98. for(i=0;i<2;i++) {
  99. __asm __volatile(
  100. "movq %4, %%mm0\n\t"
  101. "movq 8%4, %%mm1\n\t"
  102. "movq 16%4, %%mm2\n\t"
  103. "movq 24%4, %%mm3\n\t"
  104. "movq 32%4, %%mm4\n\t"
  105. "movq 40%4, %%mm5\n\t"
  106. "movq 48%4, %%mm6\n\t"
  107. "movq 56%4, %%mm7\n\t"
  108. "packuswb %%mm1, %%mm0\n\t"
  109. "packuswb %%mm3, %%mm2\n\t"
  110. "packuswb %%mm5, %%mm4\n\t"
  111. "packuswb %%mm7, %%mm6\n\t"
  112. "movq %%mm0, %0\n\t"
  113. "movq %%mm2, %1\n\t"
  114. "movq %%mm4, %2\n\t"
  115. "movq %%mm6, %3\n\t"
  116. :"=m"(*pix), "=m"(*(pix+line_size))
  117. ,"=m"(*(pix+line_size*2)), "=m"(*(pix+line_size*3))
  118. :"m"(*p)
  119. :"memory");
  120. pix += line_size*4;
  121. p += 32;
  122. }
  123. }
  124. static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
  125. {
  126. const DCTELEM *p;
  127. UINT8 *pix;
  128. int i;
  129. /* read the pixels */
  130. p = block;
  131. pix = pixels;
  132. __asm __volatile("pxor %%mm7, %%mm7":::"memory");
  133. for(i=0;i<4;i++) {
  134. __asm __volatile(
  135. "movq %2, %%mm0\n\t"
  136. "movq 8%2, %%mm1\n\t"
  137. "movq 16%2, %%mm2\n\t"
  138. "movq 24%2, %%mm3\n\t"
  139. "movq %0, %%mm4\n\t"
  140. "movq %1, %%mm6\n\t"
  141. "movq %%mm4, %%mm5\n\t"
  142. "punpcklbw %%mm7, %%mm4\n\t"
  143. "punpckhbw %%mm7, %%mm5\n\t"
  144. "paddsw %%mm4, %%mm0\n\t"
  145. "paddsw %%mm5, %%mm1\n\t"
  146. "movq %%mm6, %%mm5\n\t"
  147. "punpcklbw %%mm7, %%mm6\n\t"
  148. "punpckhbw %%mm7, %%mm5\n\t"
  149. "paddsw %%mm6, %%mm2\n\t"
  150. "paddsw %%mm5, %%mm3\n\t"
  151. "packuswb %%mm1, %%mm0\n\t"
  152. "packuswb %%mm3, %%mm2\n\t"
  153. "movq %%mm0, %0\n\t"
  154. "movq %%mm2, %1\n\t"
  155. :"=m"(*pix), "=m"(*(pix+line_size))
  156. :"m"(*p)
  157. :"memory");
  158. pix += line_size*2;
  159. p += 16;
  160. }
  161. }
  162. static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  163. {
  164. int dh, hh;
  165. UINT8 *p;
  166. const UINT8 *pix;
  167. p = block;
  168. pix = pixels;
  169. hh=h>>2;
  170. dh=h&3;
  171. while(hh--) {
  172. __asm __volatile(
  173. "movq %4, %%mm0\n\t"
  174. "movq %5, %%mm1\n\t"
  175. "movq %6, %%mm2\n\t"
  176. "movq %7, %%mm3\n\t"
  177. "movq %%mm0, %0\n\t"
  178. "movq %%mm1, %1\n\t"
  179. "movq %%mm2, %2\n\t"
  180. "movq %%mm3, %3\n\t"
  181. :"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3))
  182. :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3))
  183. :"memory");
  184. pix = pix + line_size*4;
  185. p = p + line_size*4;
  186. }
  187. while(dh--) {
  188. __asm __volatile(
  189. "movq %1, %%mm0\n\t"
  190. "movq %%mm0, %0\n\t"
  191. :"=m"(*p)
  192. :"m"(*pix)
  193. :"memory");
  194. pix = pix + line_size;
  195. p = p + line_size;
  196. }
  197. }
  198. static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  199. {
  200. UINT8 *p;
  201. const UINT8 *pix;
  202. p = block;
  203. pix = pixels;
  204. __asm __volatile(
  205. "pxor %%mm7, %%mm7\n\t"
  206. "movq %0, %%mm4\n\t"
  207. ::"m"(mm_wone):"memory");
  208. do {
  209. __asm __volatile(
  210. "movq %1, %%mm0\n\t"
  211. "movq 1%1, %%mm1\n\t"
  212. "movq %%mm0, %%mm2\n\t"
  213. "movq %%mm1, %%mm3\n\t"
  214. "punpcklbw %%mm7, %%mm0\n\t"
  215. "punpcklbw %%mm7, %%mm1\n\t"
  216. "punpckhbw %%mm7, %%mm2\n\t"
  217. "punpckhbw %%mm7, %%mm3\n\t"
  218. "paddusw %%mm1, %%mm0\n\t"
  219. "paddusw %%mm3, %%mm2\n\t"
  220. "paddusw %%mm4, %%mm0\n\t"
  221. "paddusw %%mm4, %%mm2\n\t"
  222. "psrlw $1, %%mm0\n\t"
  223. "psrlw $1, %%mm2\n\t"
  224. "packuswb %%mm2, %%mm0\n\t"
  225. "movq %%mm0, %0\n\t"
  226. :"=m"(*p)
  227. :"m"(*pix)
  228. :"memory");
  229. pix += line_size; p += line_size;
  230. } while (--h);
  231. }
  232. static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  233. {
  234. UINT8 *p;
  235. const UINT8 *pix;
  236. p = block;
  237. pix = pixels;
  238. __asm __volatile(
  239. "pxor %%mm7, %%mm7\n\t"
  240. "movq %0, %%mm4\n\t"
  241. ::"m"(mm_wone):"memory");
  242. do {
  243. __asm __volatile(
  244. "movq %1, %%mm0\n\t"
  245. "movq %2, %%mm1\n\t"
  246. "movq %%mm0, %%mm2\n\t"
  247. "movq %%mm1, %%mm3\n\t"
  248. "punpcklbw %%mm7, %%mm0\n\t"
  249. "punpcklbw %%mm7, %%mm1\n\t"
  250. "punpckhbw %%mm7, %%mm2\n\t"
  251. "punpckhbw %%mm7, %%mm3\n\t"
  252. "paddusw %%mm1, %%mm0\n\t"
  253. "paddusw %%mm3, %%mm2\n\t"
  254. "paddusw %%mm4, %%mm0\n\t"
  255. "paddusw %%mm4, %%mm2\n\t"
  256. "psrlw $1, %%mm0\n\t"
  257. "psrlw $1, %%mm2\n\t"
  258. "packuswb %%mm2, %%mm0\n\t"
  259. "movq %%mm0, %0\n\t"
  260. :"=m"(*p)
  261. :"m"(*pix),
  262. "m"(*(pix+line_size))
  263. :"memory");
  264. pix += line_size;
  265. p += line_size;
  266. } while (--h);
  267. }
  268. static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  269. {
  270. UINT8 *p;
  271. const UINT8 *pix;
  272. p = block;
  273. pix = pixels;
  274. __asm __volatile(
  275. "pxor %%mm7, %%mm7\n\t"
  276. "movq %0, %%mm6\n\t"
  277. ::"m"(mm_wtwo):"memory");
  278. do {
  279. __asm __volatile(
  280. "movq %1, %%mm0\n\t"
  281. "movq %2, %%mm1\n\t"
  282. "movq 1%1, %%mm4\n\t"
  283. "movq 1%2, %%mm5\n\t"
  284. "movq %%mm0, %%mm2\n\t"
  285. "movq %%mm1, %%mm3\n\t"
  286. "punpcklbw %%mm7, %%mm0\n\t"
  287. "punpcklbw %%mm7, %%mm1\n\t"
  288. "punpckhbw %%mm7, %%mm2\n\t"
  289. "punpckhbw %%mm7, %%mm3\n\t"
  290. "paddusw %%mm1, %%mm0\n\t"
  291. "paddusw %%mm3, %%mm2\n\t"
  292. "movq %%mm4, %%mm1\n\t"
  293. "movq %%mm5, %%mm3\n\t"
  294. "punpcklbw %%mm7, %%mm4\n\t"
  295. "punpcklbw %%mm7, %%mm5\n\t"
  296. "punpckhbw %%mm7, %%mm1\n\t"
  297. "punpckhbw %%mm7, %%mm3\n\t"
  298. "paddusw %%mm5, %%mm4\n\t"
  299. "paddusw %%mm3, %%mm1\n\t"
  300. "paddusw %%mm6, %%mm4\n\t"
  301. "paddusw %%mm6, %%mm1\n\t"
  302. "paddusw %%mm4, %%mm0\n\t"
  303. "paddusw %%mm1, %%mm2\n\t"
  304. "psrlw $2, %%mm0\n\t"
  305. "psrlw $2, %%mm2\n\t"
  306. "packuswb %%mm2, %%mm0\n\t"
  307. "movq %%mm0, %0\n\t"
  308. :"=m"(*p)
  309. :"m"(*pix),
  310. "m"(*(pix+line_size))
  311. :"memory");
  312. pix += line_size;
  313. p += line_size;
  314. } while(--h);
  315. }
  316. static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  317. {
  318. UINT8 *p;
  319. const UINT8 *pix;
  320. p = block;
  321. pix = pixels;
  322. __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory");
  323. do {
  324. __asm __volatile(
  325. "movq %1, %%mm0\n\t"
  326. "movq 1%1, %%mm1\n\t"
  327. "movq %%mm0, %%mm2\n\t"
  328. "movq %%mm1, %%mm3\n\t"
  329. "punpcklbw %%mm7, %%mm0\n\t"
  330. "punpcklbw %%mm7, %%mm1\n\t"
  331. "punpckhbw %%mm7, %%mm2\n\t"
  332. "punpckhbw %%mm7, %%mm3\n\t"
  333. "paddusw %%mm1, %%mm0\n\t"
  334. "paddusw %%mm3, %%mm2\n\t"
  335. "psrlw $1, %%mm0\n\t"
  336. "psrlw $1, %%mm2\n\t"
  337. "packuswb %%mm2, %%mm0\n\t"
  338. "movq %%mm0, %0\n\t"
  339. :"=m"(*p)
  340. :"m"(*pix)
  341. :"memory");
  342. pix += line_size;
  343. p += line_size;
  344. } while (--h);
  345. }
  346. static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  347. {
  348. UINT8 *p;
  349. const UINT8 *pix;
  350. p = block;
  351. pix = pixels;
  352. __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory");
  353. do {
  354. __asm __volatile(
  355. "movq %1, %%mm0\n\t"
  356. "movq %2, %%mm1\n\t"
  357. "movq %%mm0, %%mm2\n\t"
  358. "movq %%mm1, %%mm3\n\t"
  359. "punpcklbw %%mm7, %%mm0\n\t"
  360. "punpcklbw %%mm7, %%mm1\n\t"
  361. "punpckhbw %%mm7, %%mm2\n\t"
  362. "punpckhbw %%mm7, %%mm3\n\t"
  363. "paddusw %%mm1, %%mm0\n\t"
  364. "paddusw %%mm3, %%mm2\n\t"
  365. "psrlw $1, %%mm0\n\t"
  366. "psrlw $1, %%mm2\n\t"
  367. "packuswb %%mm2, %%mm0\n\t"
  368. "movq %%mm0, %0\n\t"
  369. :"=m"(*p)
  370. :"m"(*pix),
  371. "m"(*(pix+line_size))
  372. :"memory");
  373. pix += line_size;
  374. p += line_size;
  375. } while(--h);
  376. }
  377. static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  378. {
  379. UINT8 *p;
  380. const UINT8 *pix;
  381. p = block;
  382. pix = pixels;
  383. __asm __volatile(
  384. "pxor %%mm7, %%mm7\n\t"
  385. "movq %0, %%mm6\n\t"
  386. ::"m"(mm_wone):"memory");
  387. do {
  388. __asm __volatile(
  389. "movq %1, %%mm0\n\t"
  390. "movq %2, %%mm1\n\t"
  391. "movq 1%1, %%mm4\n\t"
  392. "movq 1%2, %%mm5\n\t"
  393. "movq %%mm0, %%mm2\n\t"
  394. "movq %%mm1, %%mm3\n\t"
  395. "punpcklbw %%mm7, %%mm0\n\t"
  396. "punpcklbw %%mm7, %%mm1\n\t"
  397. "punpckhbw %%mm7, %%mm2\n\t"
  398. "punpckhbw %%mm7, %%mm3\n\t"
  399. "paddusw %%mm1, %%mm0\n\t"
  400. "paddusw %%mm3, %%mm2\n\t"
  401. "movq %%mm4, %%mm1\n\t"
  402. "movq %%mm5, %%mm3\n\t"
  403. "punpcklbw %%mm7, %%mm4\n\t"
  404. "punpcklbw %%mm7, %%mm5\n\t"
  405. "punpckhbw %%mm7, %%mm1\n\t"
  406. "punpckhbw %%mm7, %%mm3\n\t"
  407. "paddusw %%mm5, %%mm4\n\t"
  408. "paddusw %%mm3, %%mm1\n\t"
  409. "paddusw %%mm6, %%mm4\n\t"
  410. "paddusw %%mm6, %%mm1\n\t"
  411. "paddusw %%mm4, %%mm0\n\t"
  412. "paddusw %%mm1, %%mm2\n\t"
  413. "psrlw $2, %%mm0\n\t"
  414. "psrlw $2, %%mm2\n\t"
  415. "packuswb %%mm2, %%mm0\n\t"
  416. "movq %%mm0, %0\n\t"
  417. :"=m"(*p)
  418. :"m"(*pix),
  419. "m"(*(pix+line_size))
  420. :"memory");
  421. pix += line_size;
  422. p += line_size;
  423. } while(--h);
  424. }
  425. static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  426. {
  427. UINT8 *p;
  428. const UINT8 *pix;
  429. p = block;
  430. pix = pixels;
  431. __asm __volatile(
  432. "pxor %%mm7, %%mm7\n\t"
  433. "movq %0, %%mm6\n\t"
  434. ::"m"(mm_wone):"memory");
  435. do {
  436. __asm __volatile(
  437. "movq %0, %%mm0\n\t"
  438. "movq %1, %%mm1\n\t"
  439. "movq %%mm0, %%mm2\n\t"
  440. "movq %%mm1, %%mm3\n\t"
  441. "punpcklbw %%mm7, %%mm0\n\t"
  442. "punpcklbw %%mm7, %%mm1\n\t"
  443. "punpckhbw %%mm7, %%mm2\n\t"
  444. "punpckhbw %%mm7, %%mm3\n\t"
  445. "paddusw %%mm1, %%mm0\n\t"
  446. "paddusw %%mm3, %%mm2\n\t"
  447. "paddusw %%mm6, %%mm0\n\t"
  448. "paddusw %%mm6, %%mm2\n\t"
  449. "psrlw $1, %%mm0\n\t"
  450. "psrlw $1, %%mm2\n\t"
  451. "packuswb %%mm2, %%mm0\n\t"
  452. "movq %%mm0, %0\n\t"
  453. :"=m"(*p)
  454. :"m"(*pix)
  455. :"memory");
  456. pix += line_size;
  457. p += line_size;
  458. }
  459. while (--h);
  460. }
  461. static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  462. {
  463. UINT8 *p;
  464. const UINT8 *pix;
  465. p = block;
  466. pix = pixels;
  467. __asm __volatile(
  468. "pxor %%mm7, %%mm7\n\t"
  469. "movq %0, %%mm6\n\t"
  470. ::"m"(mm_wone):"memory");
  471. do {
  472. __asm __volatile(
  473. "movq %1, %%mm1\n\t"
  474. "movq %0, %%mm0\n\t"
  475. "movq 1%1, %%mm4\n\t"
  476. "movq %%mm0, %%mm2\n\t"
  477. "movq %%mm1, %%mm3\n\t"
  478. "movq %%mm4, %%mm5\n\t"
  479. "punpcklbw %%mm7, %%mm1\n\t"
  480. "punpckhbw %%mm7, %%mm3\n\t"
  481. "punpcklbw %%mm7, %%mm4\n\t"
  482. "punpckhbw %%mm7, %%mm5\n\t"
  483. "punpcklbw %%mm7, %%mm0\n\t"
  484. "punpckhbw %%mm7, %%mm2\n\t"
  485. "paddusw %%mm4, %%mm1\n\t"
  486. "paddusw %%mm5, %%mm3\n\t"
  487. "paddusw %%mm6, %%mm1\n\t"
  488. "paddusw %%mm6, %%mm3\n\t"
  489. "psrlw $1, %%mm1\n\t"
  490. "psrlw $1, %%mm3\n\t"
  491. "paddusw %%mm6, %%mm0\n\t"
  492. "paddusw %%mm6, %%mm2\n\t"
  493. "paddusw %%mm1, %%mm0\n\t"
  494. "paddusw %%mm3, %%mm2\n\t"
  495. "psrlw $1, %%mm0\n\t"
  496. "psrlw $1, %%mm2\n\t"
  497. "packuswb %%mm2, %%mm0\n\t"
  498. "movq %%mm0, %0\n\t"
  499. :"=m"(*p)
  500. :"m"(*pix)
  501. :"memory");
  502. pix += line_size;
  503. p += line_size;
  504. } while (--h);
  505. }
  506. static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  507. {
  508. UINT8 *p;
  509. const UINT8 *pix;
  510. p = block;
  511. pix = pixels;
  512. __asm __volatile(
  513. "pxor %%mm7, %%mm7\n\t"
  514. "movq %0, %%mm6\n\t"
  515. ::"m"(mm_wone):"memory");
  516. do {
  517. __asm __volatile(
  518. "movq %1, %%mm1\n\t"
  519. "movq %0, %%mm0\n\t"
  520. "movq %2, %%mm4\n\t"
  521. "movq %%mm0, %%mm2\n\t"
  522. "movq %%mm1, %%mm3\n\t"
  523. "movq %%mm4, %%mm5\n\t"
  524. "punpcklbw %%mm7, %%mm1\n\t"
  525. "punpckhbw %%mm7, %%mm3\n\t"
  526. "punpcklbw %%mm7, %%mm4\n\t"
  527. "punpckhbw %%mm7, %%mm5\n\t"
  528. "punpcklbw %%mm7, %%mm0\n\t"
  529. "punpckhbw %%mm7, %%mm2\n\t"
  530. "paddusw %%mm4, %%mm1\n\t"
  531. "paddusw %%mm5, %%mm3\n\t"
  532. "paddusw %%mm6, %%mm1\n\t"
  533. "paddusw %%mm6, %%mm3\n\t"
  534. "psrlw $1, %%mm1\n\t"
  535. "psrlw $1, %%mm3\n\t"
  536. "paddusw %%mm6, %%mm0\n\t"
  537. "paddusw %%mm6, %%mm2\n\t"
  538. "paddusw %%mm1, %%mm0\n\t"
  539. "paddusw %%mm3, %%mm2\n\t"
  540. "psrlw $1, %%mm0\n\t"
  541. "psrlw $1, %%mm2\n\t"
  542. "packuswb %%mm2, %%mm0\n\t"
  543. "movq %%mm0, %0\n\t"
  544. :"=m"(*p)
  545. :"m"(*pix), "m"(*(pix+line_size))
  546. :"memory");
  547. pix += line_size;
  548. p += line_size ;
  549. } while(--h);
  550. }
  551. static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  552. {
  553. UINT8 *p;
  554. const UINT8 *pix;
  555. p = block;
  556. pix = pixels;
  557. __asm __volatile(
  558. "pxor %%mm7, %%mm7\n\t"
  559. "movq %0, %%mm6\n\t"
  560. ::"m"(mm_wtwo):"memory");
  561. do {
  562. __asm __volatile(
  563. "movq %1, %%mm0\n\t"
  564. "movq %2, %%mm1\n\t"
  565. "movq 1%1, %%mm4\n\t"
  566. "movq 1%2, %%mm5\n\t"
  567. "movq %%mm0, %%mm2\n\t"
  568. "movq %%mm1, %%mm3\n\t"
  569. "punpcklbw %%mm7, %%mm0\n\t"
  570. "punpcklbw %%mm7, %%mm1\n\t"
  571. "punpckhbw %%mm7, %%mm2\n\t"
  572. "punpckhbw %%mm7, %%mm3\n\t"
  573. "paddusw %%mm1, %%mm0\n\t"
  574. "paddusw %%mm3, %%mm2\n\t"
  575. "movq %%mm4, %%mm1\n\t"
  576. "movq %%mm5, %%mm3\n\t"
  577. "punpcklbw %%mm7, %%mm4\n\t"
  578. "punpcklbw %%mm7, %%mm5\n\t"
  579. "punpckhbw %%mm7, %%mm1\n\t"
  580. "punpckhbw %%mm7, %%mm3\n\t"
  581. "paddusw %%mm5, %%mm4\n\t"
  582. "paddusw %%mm3, %%mm1\n\t"
  583. "paddusw %%mm6, %%mm4\n\t"
  584. "paddusw %%mm6, %%mm1\n\t"
  585. "paddusw %%mm4, %%mm0\n\t"
  586. "paddusw %%mm1, %%mm2\n\t"
  587. "movq %3, %%mm5\n\t"
  588. "psrlw $2, %%mm0\n\t"
  589. "movq %0, %%mm1\n\t"
  590. "psrlw $2, %%mm2\n\t"
  591. "movq %%mm1, %%mm3\n\t"
  592. "punpcklbw %%mm7, %%mm1\n\t"
  593. "punpckhbw %%mm7, %%mm3\n\t"
  594. "paddusw %%mm1, %%mm0\n\t"
  595. "paddusw %%mm3, %%mm2\n\t"
  596. "paddusw %%mm5, %%mm0\n\t"
  597. "paddusw %%mm5, %%mm2\n\t"
  598. "psrlw $1, %%mm0\n\t"
  599. "psrlw $1, %%mm2\n\t"
  600. "packuswb %%mm2, %%mm0\n\t"
  601. "movq %%mm0, %0\n\t"
  602. :"=m"(*p)
  603. :"m"(*pix),
  604. "m"(*(pix+line_size)), "m"(mm_wone)
  605. :"memory");
  606. pix += line_size;
  607. p += line_size ;
  608. } while(--h);
  609. }
  610. static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  611. {
  612. UINT8 *p;
  613. const UINT8 *pix;
  614. p = block;
  615. pix = pixels;
  616. __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory");
  617. do {
  618. __asm __volatile(
  619. "movq %1, %%mm0\n\t"
  620. "movq %0, %%mm1\n\t"
  621. "movq %%mm0, %%mm2\n\t"
  622. "movq %%mm1, %%mm3\n\t"
  623. "punpcklbw %%mm7, %%mm0\n\t"
  624. "punpcklbw %%mm7, %%mm1\n\t"
  625. "punpckhbw %%mm7, %%mm2\n\t"
  626. "punpckhbw %%mm7, %%mm3\n\t"
  627. "paddusw %%mm1, %%mm0\n\t"
  628. "paddusw %%mm3, %%mm2\n\t"
  629. "psrlw $1, %%mm0\n\t"
  630. "psrlw $1, %%mm2\n\t"
  631. "packuswb %%mm2, %%mm0\n\t"
  632. "movq %%mm0, %0\n\t"
  633. :"=m"(*p)
  634. :"m"(*pix)
  635. :"memory");
  636. pix += line_size;
  637. p += line_size ;
  638. } while (--h);
  639. }
  640. static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  641. {
  642. UINT8 *p;
  643. const UINT8 *pix;
  644. p = block;
  645. pix = pixels;
  646. __asm __volatile(
  647. "pxor %%mm7, %%mm7\n\t":::"memory");
  648. do {
  649. __asm __volatile(
  650. "movq %1, %%mm0\n\t"
  651. "movq 1%1, %%mm1\n\t"
  652. "movq %0, %%mm4\n\t"
  653. "movq %%mm0, %%mm2\n\t"
  654. "movq %%mm1, %%mm3\n\t"
  655. "movq %%mm4, %%mm5\n\t"
  656. "punpcklbw %%mm7, %%mm0\n\t"
  657. "punpcklbw %%mm7, %%mm1\n\t"
  658. "punpckhbw %%mm7, %%mm2\n\t"
  659. "punpckhbw %%mm7, %%mm3\n\t"
  660. "punpcklbw %%mm7, %%mm4\n\t"
  661. "punpckhbw %%mm7, %%mm5\n\t"
  662. "paddusw %%mm1, %%mm0\n\t"
  663. "paddusw %%mm3, %%mm2\n\t"
  664. "psrlw $1, %%mm0\n\t"
  665. "psrlw $1, %%mm2\n\t"
  666. "paddusw %%mm4, %%mm0\n\t"
  667. "paddusw %%mm5, %%mm2\n\t"
  668. "psrlw $1, %%mm0\n\t"
  669. "psrlw $1, %%mm2\n\t"
  670. "packuswb %%mm2, %%mm0\n\t"
  671. "movq %%mm0, %0\n\t"
  672. :"=m"(*p)
  673. :"m"(*pix)
  674. :"memory");
  675. pix += line_size;
  676. p += line_size;
  677. } while (--h);
  678. }
  679. static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  680. {
  681. UINT8 *p;
  682. const UINT8 *pix;
  683. p = block;
  684. pix = pixels;
  685. __asm __volatile(
  686. "pxor %%mm7, %%mm7\n\t":::"memory");
  687. do {
  688. __asm __volatile(
  689. "movq %1, %%mm0\n\t"
  690. "movq %2, %%mm1\n\t"
  691. "movq %0, %%mm4\n\t"
  692. "movq %%mm0, %%mm2\n\t"
  693. "movq %%mm1, %%mm3\n\t"
  694. "movq %%mm4, %%mm5\n\t"
  695. "punpcklbw %%mm7, %%mm0\n\t"
  696. "punpcklbw %%mm7, %%mm1\n\t"
  697. "punpckhbw %%mm7, %%mm2\n\t"
  698. "punpckhbw %%mm7, %%mm3\n\t"
  699. "punpcklbw %%mm7, %%mm4\n\t"
  700. "punpckhbw %%mm7, %%mm5\n\t"
  701. "paddusw %%mm1, %%mm0\n\t"
  702. "paddusw %%mm3, %%mm2\n\t"
  703. "psrlw $1, %%mm0\n\t"
  704. "psrlw $1, %%mm2\n\t"
  705. "paddusw %%mm4, %%mm0\n\t"
  706. "paddusw %%mm5, %%mm2\n\t"
  707. "psrlw $1, %%mm0\n\t"
  708. "psrlw $1, %%mm2\n\t"
  709. "packuswb %%mm2, %%mm0\n\t"
  710. "movq %%mm0, %0\n\t"
  711. :"=m"(*p)
  712. :"m"(*pix), "m"(*(pix+line_size))
  713. :"memory");
  714. pix += line_size;
  715. p += line_size ;
  716. } while(--h);
  717. }
  718. static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  719. {
  720. UINT8 *p;
  721. const UINT8 *pix;
  722. p = block;
  723. pix = pixels;
  724. __asm __volatile(
  725. "pxor %%mm7, %%mm7\n\t"
  726. "movq %0, %%mm6\n\t"
  727. ::"m"(mm_wone):"memory");
  728. do {
  729. __asm __volatile(
  730. "movq %1, %%mm0\n\t"
  731. "movq %2, %%mm1\n\t"
  732. "movq 1%1, %%mm4\n\t"
  733. "movq 1%2, %%mm5\n\t"
  734. "movq %%mm0, %%mm2\n\t"
  735. "movq %%mm1, %%mm3\n\t"
  736. "punpcklbw %%mm7, %%mm0\n\t"
  737. "punpcklbw %%mm7, %%mm1\n\t"
  738. "punpckhbw %%mm7, %%mm2\n\t"
  739. "punpckhbw %%mm7, %%mm3\n\t"
  740. "paddusw %%mm1, %%mm0\n\t"
  741. "paddusw %%mm3, %%mm2\n\t"
  742. "movq %%mm4, %%mm1\n\t"
  743. "movq %%mm5, %%mm3\n\t"
  744. "punpcklbw %%mm7, %%mm4\n\t"
  745. "punpcklbw %%mm7, %%mm5\n\t"
  746. "punpckhbw %%mm7, %%mm1\n\t"
  747. "punpckhbw %%mm7, %%mm3\n\t"
  748. "paddusw %%mm5, %%mm4\n\t"
  749. "paddusw %%mm3, %%mm1\n\t"
  750. "paddusw %%mm6, %%mm4\n\t"
  751. "paddusw %%mm6, %%mm1\n\t"
  752. "paddusw %%mm4, %%mm0\n\t"
  753. "paddusw %%mm1, %%mm2\n\t"
  754. "movq %0, %%mm1\n\t"
  755. "psrlw $2, %%mm0\n\t"
  756. "movq %%mm1, %%mm3\n\t"
  757. "psrlw $2, %%mm2\n\t"
  758. "punpcklbw %%mm7, %%mm1\n\t"
  759. "punpckhbw %%mm7, %%mm3\n\t"
  760. "paddusw %%mm1, %%mm0\n\t"
  761. "paddusw %%mm3, %%mm2\n\t"
  762. "psrlw $1, %%mm0\n\t"
  763. "psrlw $1, %%mm2\n\t"
  764. "packuswb %%mm2, %%mm0\n\t"
  765. "movq %%mm0, %0\n\t"
  766. :"=m"(*p)
  767. :"m"(*pix),
  768. "m"(*(pix+line_size))
  769. :"memory");
  770. pix += line_size;
  771. p += line_size;
  772. } while(--h);
  773. }
  774. static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  775. {
  776. DCTELEM *p;
  777. const UINT8 *pix;
  778. p = block;
  779. pix = pixels;
  780. __asm __volatile("pxor %%mm7, %%mm7":::"memory");
  781. do {
  782. __asm __volatile(
  783. "movq %0, %%mm0\n\t"
  784. "movq %1, %%mm2\n\t"
  785. "movq 8%0, %%mm1\n\t"
  786. "movq %%mm2, %%mm3\n\t"
  787. "punpcklbw %%mm7, %%mm2\n\t"
  788. "punpckhbw %%mm7, %%mm3\n\t"
  789. "psubsw %%mm2, %%mm0\n\t"
  790. "psubsw %%mm3, %%mm1\n\t"
  791. "movq %%mm0, %0\n\t"
  792. "movq %%mm1, 8%0\n\t"
  793. :"=m"(*p)
  794. :"m"(*pix)
  795. :"memory");
  796. pix += line_size;
  797. p += 8;
  798. } while (--h);
  799. }
  800. static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  801. {
  802. DCTELEM *p;
  803. const UINT8 *pix;
  804. p = block;
  805. pix = pixels;
  806. __asm __volatile(
  807. "pxor %%mm7, %%mm7\n\t"
  808. "movq %0, %%mm6"
  809. ::"m"(mm_wone):"memory");
  810. do {
  811. __asm __volatile(
  812. "movq %0, %%mm0\n\t"
  813. "movq %1, %%mm2\n\t"
  814. "movq 8%0, %%mm1\n\t"
  815. "movq 1%1, %%mm4\n\t"
  816. "movq %%mm2, %%mm3\n\t"
  817. "movq %%mm4, %%mm5\n\t"
  818. "punpcklbw %%mm7, %%mm2\n\t"
  819. "punpckhbw %%mm7, %%mm3\n\t"
  820. "punpcklbw %%mm7, %%mm4\n\t"
  821. "punpckhbw %%mm7, %%mm5\n\t"
  822. "paddusw %%mm4, %%mm2\n\t"
  823. "paddusw %%mm5, %%mm3\n\t"
  824. "paddusw %%mm6, %%mm2\n\t"
  825. "paddusw %%mm6, %%mm3\n\t"
  826. "psrlw $1, %%mm2\n\t"
  827. "psrlw $1, %%mm3\n\t"
  828. "psubsw %%mm2, %%mm0\n\t"
  829. "psubsw %%mm3, %%mm1\n\t"
  830. "movq %%mm0, %0\n\t"
  831. "movq %%mm1, 8%0\n\t"
  832. :"=m"(*p)
  833. :"m"(*pix)
  834. :"memory");
  835. pix += line_size;
  836. p += 8;
  837. } while (--h);
  838. }
  839. static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  840. {
  841. DCTELEM *p;
  842. const UINT8 *pix;
  843. p = block;
  844. pix = pixels;
  845. __asm __volatile(
  846. "pxor %%mm7, %%mm7\n\t"
  847. "movq %0, %%mm6"
  848. ::"m"(mm_wone):"memory");
  849. do {
  850. __asm __volatile(
  851. "movq %0, %%mm0\n\t"
  852. "movq %1, %%mm2\n\t"
  853. "movq 8%0, %%mm1\n\t"
  854. "movq %2, %%mm4\n\t"
  855. "movq %%mm2, %%mm3\n\t"
  856. "movq %%mm4, %%mm5\n\t"
  857. "punpcklbw %%mm7, %%mm2\n\t"
  858. "punpckhbw %%mm7, %%mm3\n\t"
  859. "punpcklbw %%mm7, %%mm4\n\t"
  860. "punpckhbw %%mm7, %%mm5\n\t"
  861. "paddusw %%mm4, %%mm2\n\t"
  862. "paddusw %%mm5, %%mm3\n\t"
  863. "paddusw %%mm6, %%mm2\n\t"
  864. "paddusw %%mm6, %%mm3\n\t"
  865. "psrlw $1, %%mm2\n\t"
  866. "psrlw $1, %%mm3\n\t"
  867. "psubsw %%mm2, %%mm0\n\t"
  868. "psubsw %%mm3, %%mm1\n\t"
  869. "movq %%mm0, %0\n\t"
  870. "movq %%mm1, 8%0\n\t"
  871. :"=m"(*p)
  872. :"m"(*pix), "m"(*(pix+line_size))
  873. :"memory");
  874. pix += line_size;
  875. p += 8;
  876. } while (--h);
  877. }
  878. static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  879. {
  880. DCTELEM *p;
  881. const UINT8 *pix;
  882. p = block;
  883. pix = pixels;
  884. __asm __volatile(
  885. "pxor %%mm7, %%mm7\n\t"
  886. "movq %0, %%mm6\n\t"
  887. ::"m"(mm_wtwo):"memory");
  888. do {
  889. __asm __volatile(
  890. "movq %1, %%mm0\n\t"
  891. "movq %2, %%mm1\n\t"
  892. "movq 1%1, %%mm4\n\t"
  893. "movq 1%2, %%mm5\n\t"
  894. "movq %%mm0, %%mm2\n\t"
  895. "movq %%mm1, %%mm3\n\t"
  896. "punpcklbw %%mm7, %%mm0\n\t"
  897. "punpcklbw %%mm7, %%mm1\n\t"
  898. "punpckhbw %%mm7, %%mm2\n\t"
  899. "punpckhbw %%mm7, %%mm3\n\t"
  900. "paddusw %%mm1, %%mm0\n\t"
  901. "paddusw %%mm3, %%mm2\n\t"
  902. "movq %%mm4, %%mm1\n\t"
  903. "movq %%mm5, %%mm3\n\t"
  904. "punpcklbw %%mm7, %%mm4\n\t"
  905. "punpcklbw %%mm7, %%mm5\n\t"
  906. "punpckhbw %%mm7, %%mm1\n\t"
  907. "punpckhbw %%mm7, %%mm3\n\t"
  908. "paddusw %%mm5, %%mm4\n\t"
  909. "paddusw %%mm3, %%mm1\n\t"
  910. "paddusw %%mm6, %%mm4\n\t"
  911. "paddusw %%mm6, %%mm1\n\t"
  912. "paddusw %%mm4, %%mm0\n\t"
  913. "paddusw %%mm1, %%mm2\n\t"
  914. "movq %0, %%mm1\n\t"
  915. "movq 8%0, %%mm3\n\t"
  916. "psrlw $2, %%mm0\n\t"
  917. "psrlw $2, %%mm2\n\t"
  918. "psubsw %%mm0, %%mm1\n\t"
  919. "psubsw %%mm2, %%mm3\n\t"
  920. "movq %%mm1, %0\n\t"
  921. "movq %%mm3, 8%0\n\t"
  922. :"=m"(*p)
  923. :"m"(*pix),
  924. "m"(*(pix+line_size))
  925. :"memory");
  926. pix += line_size;
  927. p += 8 ;
  928. } while(--h);
  929. }
  930. void dsputil_init_mmx(void)
  931. {
  932. mm_flags = mm_support();
  933. #if 0
  934. printf("CPU flags:");
  935. if (mm_flags & MM_MMX)
  936. printf(" mmx");
  937. if (mm_flags & MM_MMXEXT)
  938. printf(" mmxext");
  939. if (mm_flags & MM_3DNOW)
  940. printf(" 3dnow");
  941. if (mm_flags & MM_SSE)
  942. printf(" sse");
  943. if (mm_flags & MM_SSE2)
  944. printf(" sse2");
  945. printf("\n");
  946. #endif
  947. if (mm_flags & MM_MMX) {
  948. get_pixels = get_pixels_mmx;
  949. put_pixels_clamped = put_pixels_clamped_mmx;
  950. add_pixels_clamped = add_pixels_clamped_mmx;
  951. pix_abs16x16 = pix_abs16x16_mmx;
  952. pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
  953. pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
  954. pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
  955. av_fdct = fdct_mmx;
  956. put_pixels_tab[0] = put_pixels_mmx;
  957. put_pixels_tab[1] = put_pixels_x2_mmx;
  958. put_pixels_tab[2] = put_pixels_y2_mmx;
  959. put_pixels_tab[3] = put_pixels_xy2_mmx;
  960. put_no_rnd_pixels_tab[0] = put_pixels_mmx;
  961. put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
  962. put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
  963. put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
  964. avg_pixels_tab[0] = avg_pixels_mmx;
  965. avg_pixels_tab[1] = avg_pixels_x2_mmx;
  966. avg_pixels_tab[2] = avg_pixels_y2_mmx;
  967. avg_pixels_tab[3] = avg_pixels_xy2_mmx;
  968. avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
  969. avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
  970. avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
  971. avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
  972. sub_pixels_tab[0] = sub_pixels_mmx;
  973. sub_pixels_tab[1] = sub_pixels_x2_mmx;
  974. sub_pixels_tab[2] = sub_pixels_y2_mmx;
  975. sub_pixels_tab[3] = sub_pixels_xy2_mmx;
  976. if (mm_flags & MM_MMXEXT) {
  977. pix_abs16x16 = pix_abs16x16_sse;
  978. }
  979. if (mm_flags & MM_SSE) {
  980. put_pixels_tab[1] = put_pixels_x2_sse;
  981. put_pixels_tab[2] = put_pixels_y2_sse;
  982. avg_pixels_tab[0] = avg_pixels_sse;
  983. avg_pixels_tab[1] = avg_pixels_x2_sse;
  984. avg_pixels_tab[2] = avg_pixels_y2_sse;
  985. avg_pixels_tab[3] = avg_pixels_xy2_sse;
  986. sub_pixels_tab[1] = sub_pixels_x2_sse;
  987. sub_pixels_tab[2] = sub_pixels_y2_sse;
  988. } else if (mm_flags & MM_3DNOW) {
  989. put_pixels_tab[1] = put_pixels_x2_3dnow;
  990. put_pixels_tab[2] = put_pixels_y2_3dnow;
  991. avg_pixels_tab[0] = avg_pixels_3dnow;
  992. avg_pixels_tab[1] = avg_pixels_x2_3dnow;
  993. avg_pixels_tab[2] = avg_pixels_y2_3dnow;
  994. avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
  995. sub_pixels_tab[1] = sub_pixels_x2_3dnow;
  996. sub_pixels_tab[2] = sub_pixels_y2_3dnow;
  997. }
  998. #ifdef USE_MMX_IDCT
  999. /* use MMX / MMXEXT iDCT code from libmpeg2 */
  1000. //printf("LIBAVCODEC: Using MMX%s iDCT code\n",(mm_flags & MM_MMXEXT)?"EXT":"");
  1001. ff_idct = (mm_flags & MM_MMXEXT) ? mmxext_idct : mmx_idct;
  1002. /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
  1003. { int i,j;
  1004. for (i = 0; i < 64; i++) {
  1005. j = zigzag_direct[i];
  1006. zigzag_direct[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
  1007. j = ff_alternate_horizontal_scan[i];
  1008. ff_alternate_horizontal_scan[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
  1009. j = ff_alternate_vertical_scan[i];
  1010. ff_alternate_vertical_scan[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
  1011. }
  1012. }
  1013. #endif
  1014. }
  1015. }