You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1044 lines
25KB

  1. /*
  2. * MMX optimized DSP utils
  3. * Copyright (c) 2000, 2001 Gerard Lantau.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18. *
  19. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  20. */
  21. #include "../dsputil.h"
  22. int mm_flags; /* multimedia extension flags */
  23. int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
  24. int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h);
  25. int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
  26. int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
  27. int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
  /* pixel operations */

  /*
   * Rounding constants used by the averaging kernels below. Each value is
   * four packed 16-bit words (0x0001 x4 and 0x0002 x4), declared 8-byte
   * aligned and loaded into an MMX register with a single movq.
   *
   * The ULL suffix makes the 64-bit type explicit, so compilers whose
   * 'long' is 32 bits do not have to widen the literal implicitly.
   */
  static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
  static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
  33. /***********************************/
  34. /* 3Dnow specific */
  35. #define DEF(x) x ## _3dnow
  /* for Athlons PAVGUSB is preferred */
  37. #define PAVGB "pavgusb"
  38. #include "dsputil_mmx_avg.h"
  39. #undef DEF
  40. #undef PAVGB
  41. /***********************************/
  42. /* MMX2 specific */
  43. #define DEF(x) x ## _sse
  44. /* Introduced only in MMX2 set */
  45. #define PAVGB "pavgb"
  46. #include "dsputil_mmx_avg.h"
  47. #undef DEF
  48. #undef PAVGB
  49. /***********************************/
  50. /* standard MMX */
  51. static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
  52. {
  53. DCTELEM *p;
  54. const UINT8 *pix;
  55. int i;
  56. /* read the pixels */
  57. p = block;
  58. pix = pixels;
  59. __asm __volatile("pxor %%mm7, %%mm7":::"memory");
  60. for(i=0;i<4;i++) {
  61. __asm __volatile(
  62. "movq %1, %%mm0\n\t"
  63. "movq %2, %%mm1\n\t"
  64. "movq %%mm0, %%mm2\n\t"
  65. "movq %%mm1, %%mm3\n\t"
  66. "punpcklbw %%mm7, %%mm0\n\t"
  67. "punpckhbw %%mm7, %%mm2\n\t"
  68. "punpcklbw %%mm7, %%mm1\n\t"
  69. "punpckhbw %%mm7, %%mm3\n\t"
  70. "movq %%mm0, %0\n\t"
  71. "movq %%mm2, 8%0\n\t"
  72. "movq %%mm1, 16%0\n\t"
  73. "movq %%mm3, 24%0\n\t"
  74. :"=m"(*p)
  75. :"m"(*pix), "m"(*(pix+line_size))
  76. :"memory");
  77. pix += line_size*2;
  78. p += 16;
  79. }
  80. }
  81. static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
  82. {
  83. const DCTELEM *p;
  84. UINT8 *pix;
  85. int i;
  86. /* read the pixels */
  87. p = block;
  88. pix = pixels;
  89. for(i=0;i<2;i++) {
  90. __asm __volatile(
  91. "movq %4, %%mm0\n\t"
  92. "movq 8%4, %%mm1\n\t"
  93. "movq 16%4, %%mm2\n\t"
  94. "movq 24%4, %%mm3\n\t"
  95. "movq 32%4, %%mm4\n\t"
  96. "movq 40%4, %%mm5\n\t"
  97. "movq 48%4, %%mm6\n\t"
  98. "movq 56%4, %%mm7\n\t"
  99. "packuswb %%mm1, %%mm0\n\t"
  100. "packuswb %%mm3, %%mm2\n\t"
  101. "packuswb %%mm5, %%mm4\n\t"
  102. "packuswb %%mm7, %%mm6\n\t"
  103. "movq %%mm0, %0\n\t"
  104. "movq %%mm2, %1\n\t"
  105. "movq %%mm4, %2\n\t"
  106. "movq %%mm6, %3\n\t"
  107. :"=m"(*pix), "=m"(*(pix+line_size))
  108. ,"=m"(*(pix+line_size*2)), "=m"(*(pix+line_size*3))
  109. :"m"(*p)
  110. :"memory");
  111. pix += line_size*4;
  112. p += 32;
  113. }
  114. }
  115. static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
  116. {
  117. const DCTELEM *p;
  118. UINT8 *pix;
  119. int i;
  120. /* read the pixels */
  121. p = block;
  122. pix = pixels;
  123. __asm __volatile("pxor %%mm7, %%mm7":::"memory");
  124. for(i=0;i<4;i++) {
  125. __asm __volatile(
  126. "movq %2, %%mm0\n\t"
  127. "movq 8%2, %%mm1\n\t"
  128. "movq 16%2, %%mm2\n\t"
  129. "movq 24%2, %%mm3\n\t"
  130. "movq %0, %%mm4\n\t"
  131. "movq %1, %%mm6\n\t"
  132. "movq %%mm4, %%mm5\n\t"
  133. "punpcklbw %%mm7, %%mm4\n\t"
  134. "punpckhbw %%mm7, %%mm5\n\t"
  135. "paddsw %%mm4, %%mm0\n\t"
  136. "paddsw %%mm5, %%mm1\n\t"
  137. "movq %%mm6, %%mm5\n\t"
  138. "punpcklbw %%mm7, %%mm6\n\t"
  139. "punpckhbw %%mm7, %%mm5\n\t"
  140. "paddsw %%mm6, %%mm2\n\t"
  141. "paddsw %%mm5, %%mm3\n\t"
  142. "packuswb %%mm1, %%mm0\n\t"
  143. "packuswb %%mm3, %%mm2\n\t"
  144. "movq %%mm0, %0\n\t"
  145. "movq %%mm2, %1\n\t"
  146. :"=m"(*pix), "=m"(*(pix+line_size))
  147. :"m"(*p)
  148. :"memory");
  149. pix += line_size*2;
  150. p += 16;
  151. }
  152. }
  153. static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  154. {
  155. int dh, hh;
  156. UINT8 *p;
  157. const UINT8 *pix;
  158. p = block;
  159. pix = pixels;
  160. hh=h>>2;
  161. dh=h&3;
  162. while(hh--) {
  163. __asm __volatile(
  164. "movq %4, %%mm0\n\t"
  165. "movq %5, %%mm1\n\t"
  166. "movq %6, %%mm2\n\t"
  167. "movq %7, %%mm3\n\t"
  168. "movq %%mm0, %0\n\t"
  169. "movq %%mm1, %1\n\t"
  170. "movq %%mm2, %2\n\t"
  171. "movq %%mm3, %3\n\t"
  172. :"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3))
  173. :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3))
  174. :"memory");
  175. pix = pix + line_size*4;
  176. p = p + line_size*4;
  177. }
  178. while(dh--) {
  179. __asm __volatile(
  180. "movq %1, %%mm0\n\t"
  181. "movq %%mm0, %0\n\t"
  182. :"=m"(*p)
  183. :"m"(*pix)
  184. :"memory");
  185. pix = pix + line_size;
  186. p = p + line_size;
  187. }
  188. }
  189. static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  190. {
  191. UINT8 *p;
  192. const UINT8 *pix;
  193. p = block;
  194. pix = pixels;
  195. __asm __volatile(
  196. "pxor %%mm7, %%mm7\n\t"
  197. "movq %0, %%mm4\n\t"
  198. ::"m"(mm_wone):"memory");
  199. do {
  200. __asm __volatile(
  201. "movq %1, %%mm0\n\t"
  202. "movq 1%1, %%mm1\n\t"
  203. "movq %%mm0, %%mm2\n\t"
  204. "movq %%mm1, %%mm3\n\t"
  205. "punpcklbw %%mm7, %%mm0\n\t"
  206. "punpcklbw %%mm7, %%mm1\n\t"
  207. "punpckhbw %%mm7, %%mm2\n\t"
  208. "punpckhbw %%mm7, %%mm3\n\t"
  209. "paddusw %%mm1, %%mm0\n\t"
  210. "paddusw %%mm3, %%mm2\n\t"
  211. "paddusw %%mm4, %%mm0\n\t"
  212. "paddusw %%mm4, %%mm2\n\t"
  213. "psrlw $1, %%mm0\n\t"
  214. "psrlw $1, %%mm2\n\t"
  215. "packuswb %%mm2, %%mm0\n\t"
  216. "movq %%mm0, %0\n\t"
  217. :"=m"(*p)
  218. :"m"(*pix)
  219. :"memory");
  220. pix += line_size; p += line_size;
  221. } while (--h);
  222. }
  223. static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  224. {
  225. UINT8 *p;
  226. const UINT8 *pix;
  227. p = block;
  228. pix = pixels;
  229. __asm __volatile(
  230. "pxor %%mm7, %%mm7\n\t"
  231. "movq %0, %%mm4\n\t"
  232. ::"m"(mm_wone):"memory");
  233. do {
  234. __asm __volatile(
  235. "movq %1, %%mm0\n\t"
  236. "movq %2, %%mm1\n\t"
  237. "movq %%mm0, %%mm2\n\t"
  238. "movq %%mm1, %%mm3\n\t"
  239. "punpcklbw %%mm7, %%mm0\n\t"
  240. "punpcklbw %%mm7, %%mm1\n\t"
  241. "punpckhbw %%mm7, %%mm2\n\t"
  242. "punpckhbw %%mm7, %%mm3\n\t"
  243. "paddusw %%mm1, %%mm0\n\t"
  244. "paddusw %%mm3, %%mm2\n\t"
  245. "paddusw %%mm4, %%mm0\n\t"
  246. "paddusw %%mm4, %%mm2\n\t"
  247. "psrlw $1, %%mm0\n\t"
  248. "psrlw $1, %%mm2\n\t"
  249. "packuswb %%mm2, %%mm0\n\t"
  250. "movq %%mm0, %0\n\t"
  251. :"=m"(*p)
  252. :"m"(*pix),
  253. "m"(*(pix+line_size))
  254. :"memory");
  255. pix += line_size;
  256. p += line_size;
  257. } while (--h);
  258. }
  259. static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  260. {
  261. UINT8 *p;
  262. const UINT8 *pix;
  263. p = block;
  264. pix = pixels;
  265. __asm __volatile(
  266. "pxor %%mm7, %%mm7\n\t"
  267. "movq %0, %%mm6\n\t"
  268. ::"m"(mm_wtwo):"memory");
  269. do {
  270. __asm __volatile(
  271. "movq %1, %%mm0\n\t"
  272. "movq %2, %%mm1\n\t"
  273. "movq 1%1, %%mm4\n\t"
  274. "movq 1%2, %%mm5\n\t"
  275. "movq %%mm0, %%mm2\n\t"
  276. "movq %%mm1, %%mm3\n\t"
  277. "punpcklbw %%mm7, %%mm0\n\t"
  278. "punpcklbw %%mm7, %%mm1\n\t"
  279. "punpckhbw %%mm7, %%mm2\n\t"
  280. "punpckhbw %%mm7, %%mm3\n\t"
  281. "paddusw %%mm1, %%mm0\n\t"
  282. "paddusw %%mm3, %%mm2\n\t"
  283. "movq %%mm4, %%mm1\n\t"
  284. "movq %%mm5, %%mm3\n\t"
  285. "punpcklbw %%mm7, %%mm4\n\t"
  286. "punpcklbw %%mm7, %%mm5\n\t"
  287. "punpckhbw %%mm7, %%mm1\n\t"
  288. "punpckhbw %%mm7, %%mm3\n\t"
  289. "paddusw %%mm5, %%mm4\n\t"
  290. "paddusw %%mm3, %%mm1\n\t"
  291. "paddusw %%mm6, %%mm4\n\t"
  292. "paddusw %%mm6, %%mm1\n\t"
  293. "paddusw %%mm4, %%mm0\n\t"
  294. "paddusw %%mm1, %%mm2\n\t"
  295. "psrlw $2, %%mm0\n\t"
  296. "psrlw $2, %%mm2\n\t"
  297. "packuswb %%mm2, %%mm0\n\t"
  298. "movq %%mm0, %0\n\t"
  299. :"=m"(*p)
  300. :"m"(*pix),
  301. "m"(*(pix+line_size))
  302. :"memory");
  303. pix += line_size;
  304. p += line_size;
  305. } while(--h);
  306. }
  307. static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  308. {
  309. UINT8 *p;
  310. const UINT8 *pix;
  311. p = block;
  312. pix = pixels;
  313. __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory");
  314. do {
  315. __asm __volatile(
  316. "movq %1, %%mm0\n\t"
  317. "movq 1%1, %%mm1\n\t"
  318. "movq %%mm0, %%mm2\n\t"
  319. "movq %%mm1, %%mm3\n\t"
  320. "punpcklbw %%mm7, %%mm0\n\t"
  321. "punpcklbw %%mm7, %%mm1\n\t"
  322. "punpckhbw %%mm7, %%mm2\n\t"
  323. "punpckhbw %%mm7, %%mm3\n\t"
  324. "paddusw %%mm1, %%mm0\n\t"
  325. "paddusw %%mm3, %%mm2\n\t"
  326. "psrlw $1, %%mm0\n\t"
  327. "psrlw $1, %%mm2\n\t"
  328. "packuswb %%mm2, %%mm0\n\t"
  329. "movq %%mm0, %0\n\t"
  330. :"=m"(*p)
  331. :"m"(*pix)
  332. :"memory");
  333. pix += line_size;
  334. p += line_size;
  335. } while (--h);
  336. }
  337. static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  338. {
  339. UINT8 *p;
  340. const UINT8 *pix;
  341. p = block;
  342. pix = pixels;
  343. __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory");
  344. do {
  345. __asm __volatile(
  346. "movq %1, %%mm0\n\t"
  347. "movq %2, %%mm1\n\t"
  348. "movq %%mm0, %%mm2\n\t"
  349. "movq %%mm1, %%mm3\n\t"
  350. "punpcklbw %%mm7, %%mm0\n\t"
  351. "punpcklbw %%mm7, %%mm1\n\t"
  352. "punpckhbw %%mm7, %%mm2\n\t"
  353. "punpckhbw %%mm7, %%mm3\n\t"
  354. "paddusw %%mm1, %%mm0\n\t"
  355. "paddusw %%mm3, %%mm2\n\t"
  356. "psrlw $1, %%mm0\n\t"
  357. "psrlw $1, %%mm2\n\t"
  358. "packuswb %%mm2, %%mm0\n\t"
  359. "movq %%mm0, %0\n\t"
  360. :"=m"(*p)
  361. :"m"(*pix),
  362. "m"(*(pix+line_size))
  363. :"memory");
  364. pix += line_size;
  365. p += line_size;
  366. } while(--h);
  367. }
  368. static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  369. {
  370. UINT8 *p;
  371. const UINT8 *pix;
  372. p = block;
  373. pix = pixels;
  374. __asm __volatile(
  375. "pxor %%mm7, %%mm7\n\t"
  376. "movq %0, %%mm6\n\t"
  377. ::"m"(mm_wone):"memory");
  378. do {
  379. __asm __volatile(
  380. "movq %1, %%mm0\n\t"
  381. "movq %2, %%mm1\n\t"
  382. "movq 1%1, %%mm4\n\t"
  383. "movq 1%2, %%mm5\n\t"
  384. "movq %%mm0, %%mm2\n\t"
  385. "movq %%mm1, %%mm3\n\t"
  386. "punpcklbw %%mm7, %%mm0\n\t"
  387. "punpcklbw %%mm7, %%mm1\n\t"
  388. "punpckhbw %%mm7, %%mm2\n\t"
  389. "punpckhbw %%mm7, %%mm3\n\t"
  390. "paddusw %%mm1, %%mm0\n\t"
  391. "paddusw %%mm3, %%mm2\n\t"
  392. "movq %%mm4, %%mm1\n\t"
  393. "movq %%mm5, %%mm3\n\t"
  394. "punpcklbw %%mm7, %%mm4\n\t"
  395. "punpcklbw %%mm7, %%mm5\n\t"
  396. "punpckhbw %%mm7, %%mm1\n\t"
  397. "punpckhbw %%mm7, %%mm3\n\t"
  398. "paddusw %%mm5, %%mm4\n\t"
  399. "paddusw %%mm3, %%mm1\n\t"
  400. "paddusw %%mm6, %%mm4\n\t"
  401. "paddusw %%mm6, %%mm1\n\t"
  402. "paddusw %%mm4, %%mm0\n\t"
  403. "paddusw %%mm1, %%mm2\n\t"
  404. "psrlw $2, %%mm0\n\t"
  405. "psrlw $2, %%mm2\n\t"
  406. "packuswb %%mm2, %%mm0\n\t"
  407. "movq %%mm0, %0\n\t"
  408. :"=m"(*p)
  409. :"m"(*pix),
  410. "m"(*(pix+line_size))
  411. :"memory");
  412. pix += line_size;
  413. p += line_size;
  414. } while(--h);
  415. }
  416. static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  417. {
  418. UINT8 *p;
  419. const UINT8 *pix;
  420. p = block;
  421. pix = pixels;
  422. __asm __volatile(
  423. "pxor %%mm7, %%mm7\n\t"
  424. "movq %0, %%mm6\n\t"
  425. ::"m"(mm_wone):"memory");
  426. do {
  427. __asm __volatile(
  428. "movq %0, %%mm0\n\t"
  429. "movq %1, %%mm1\n\t"
  430. "movq %%mm0, %%mm2\n\t"
  431. "movq %%mm1, %%mm3\n\t"
  432. "punpcklbw %%mm7, %%mm0\n\t"
  433. "punpcklbw %%mm7, %%mm1\n\t"
  434. "punpckhbw %%mm7, %%mm2\n\t"
  435. "punpckhbw %%mm7, %%mm3\n\t"
  436. "paddusw %%mm1, %%mm0\n\t"
  437. "paddusw %%mm3, %%mm2\n\t"
  438. "paddusw %%mm6, %%mm0\n\t"
  439. "paddusw %%mm6, %%mm2\n\t"
  440. "psrlw $1, %%mm0\n\t"
  441. "psrlw $1, %%mm2\n\t"
  442. "packuswb %%mm2, %%mm0\n\t"
  443. "movq %%mm0, %0\n\t"
  444. :"=m"(*p)
  445. :"m"(*pix)
  446. :"memory");
  447. pix += line_size;
  448. p += line_size;
  449. }
  450. while (--h);
  451. }
  452. static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  453. {
  454. UINT8 *p;
  455. const UINT8 *pix;
  456. p = block;
  457. pix = pixels;
  458. __asm __volatile(
  459. "pxor %%mm7, %%mm7\n\t"
  460. "movq %0, %%mm6\n\t"
  461. ::"m"(mm_wone):"memory");
  462. do {
  463. __asm __volatile(
  464. "movq %1, %%mm1\n\t"
  465. "movq %0, %%mm0\n\t"
  466. "movq 1%1, %%mm4\n\t"
  467. "movq %%mm0, %%mm2\n\t"
  468. "movq %%mm1, %%mm3\n\t"
  469. "movq %%mm4, %%mm5\n\t"
  470. "punpcklbw %%mm7, %%mm1\n\t"
  471. "punpckhbw %%mm7, %%mm3\n\t"
  472. "punpcklbw %%mm7, %%mm4\n\t"
  473. "punpckhbw %%mm7, %%mm5\n\t"
  474. "punpcklbw %%mm7, %%mm0\n\t"
  475. "punpckhbw %%mm7, %%mm2\n\t"
  476. "paddusw %%mm4, %%mm1\n\t"
  477. "paddusw %%mm5, %%mm3\n\t"
  478. "paddusw %%mm6, %%mm1\n\t"
  479. "paddusw %%mm6, %%mm3\n\t"
  480. "psrlw $1, %%mm1\n\t"
  481. "psrlw $1, %%mm3\n\t"
  482. "paddusw %%mm6, %%mm0\n\t"
  483. "paddusw %%mm6, %%mm2\n\t"
  484. "paddusw %%mm1, %%mm0\n\t"
  485. "paddusw %%mm3, %%mm2\n\t"
  486. "psrlw $1, %%mm0\n\t"
  487. "psrlw $1, %%mm2\n\t"
  488. "packuswb %%mm2, %%mm0\n\t"
  489. "movq %%mm0, %0\n\t"
  490. :"=m"(*p)
  491. :"m"(*pix)
  492. :"memory");
  493. pix += line_size;
  494. p += line_size;
  495. } while (--h);
  496. }
  497. static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  498. {
  499. UINT8 *p;
  500. const UINT8 *pix;
  501. p = block;
  502. pix = pixels;
  503. __asm __volatile(
  504. "pxor %%mm7, %%mm7\n\t"
  505. "movq %0, %%mm6\n\t"
  506. ::"m"(mm_wone):"memory");
  507. do {
  508. __asm __volatile(
  509. "movq %1, %%mm1\n\t"
  510. "movq %0, %%mm0\n\t"
  511. "movq %2, %%mm4\n\t"
  512. "movq %%mm0, %%mm2\n\t"
  513. "movq %%mm1, %%mm3\n\t"
  514. "movq %%mm4, %%mm5\n\t"
  515. "punpcklbw %%mm7, %%mm1\n\t"
  516. "punpckhbw %%mm7, %%mm3\n\t"
  517. "punpcklbw %%mm7, %%mm4\n\t"
  518. "punpckhbw %%mm7, %%mm5\n\t"
  519. "punpcklbw %%mm7, %%mm0\n\t"
  520. "punpckhbw %%mm7, %%mm2\n\t"
  521. "paddusw %%mm4, %%mm1\n\t"
  522. "paddusw %%mm5, %%mm3\n\t"
  523. "paddusw %%mm6, %%mm1\n\t"
  524. "paddusw %%mm6, %%mm3\n\t"
  525. "psrlw $1, %%mm1\n\t"
  526. "psrlw $1, %%mm3\n\t"
  527. "paddusw %%mm6, %%mm0\n\t"
  528. "paddusw %%mm6, %%mm2\n\t"
  529. "paddusw %%mm1, %%mm0\n\t"
  530. "paddusw %%mm3, %%mm2\n\t"
  531. "psrlw $1, %%mm0\n\t"
  532. "psrlw $1, %%mm2\n\t"
  533. "packuswb %%mm2, %%mm0\n\t"
  534. "movq %%mm0, %0\n\t"
  535. :"=m"(*p)
  536. :"m"(*pix), "m"(*(pix+line_size))
  537. :"memory");
  538. pix += line_size;
  539. p += line_size ;
  540. } while(--h);
  541. }
  542. static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  543. {
  544. UINT8 *p;
  545. const UINT8 *pix;
  546. p = block;
  547. pix = pixels;
  548. __asm __volatile(
  549. "pxor %%mm7, %%mm7\n\t"
  550. "movq %0, %%mm6\n\t"
  551. ::"m"(mm_wtwo):"memory");
  552. do {
  553. __asm __volatile(
  554. "movq %1, %%mm0\n\t"
  555. "movq %2, %%mm1\n\t"
  556. "movq 1%1, %%mm4\n\t"
  557. "movq 1%2, %%mm5\n\t"
  558. "movq %%mm0, %%mm2\n\t"
  559. "movq %%mm1, %%mm3\n\t"
  560. "punpcklbw %%mm7, %%mm0\n\t"
  561. "punpcklbw %%mm7, %%mm1\n\t"
  562. "punpckhbw %%mm7, %%mm2\n\t"
  563. "punpckhbw %%mm7, %%mm3\n\t"
  564. "paddusw %%mm1, %%mm0\n\t"
  565. "paddusw %%mm3, %%mm2\n\t"
  566. "movq %%mm4, %%mm1\n\t"
  567. "movq %%mm5, %%mm3\n\t"
  568. "punpcklbw %%mm7, %%mm4\n\t"
  569. "punpcklbw %%mm7, %%mm5\n\t"
  570. "punpckhbw %%mm7, %%mm1\n\t"
  571. "punpckhbw %%mm7, %%mm3\n\t"
  572. "paddusw %%mm5, %%mm4\n\t"
  573. "paddusw %%mm3, %%mm1\n\t"
  574. "paddusw %%mm6, %%mm4\n\t"
  575. "paddusw %%mm6, %%mm1\n\t"
  576. "paddusw %%mm4, %%mm0\n\t"
  577. "paddusw %%mm1, %%mm2\n\t"
  578. "movq %3, %%mm5\n\t"
  579. "psrlw $2, %%mm0\n\t"
  580. "movq %0, %%mm1\n\t"
  581. "psrlw $2, %%mm2\n\t"
  582. "movq %%mm1, %%mm3\n\t"
  583. "punpcklbw %%mm7, %%mm1\n\t"
  584. "punpckhbw %%mm7, %%mm3\n\t"
  585. "paddusw %%mm1, %%mm0\n\t"
  586. "paddusw %%mm3, %%mm2\n\t"
  587. "paddusw %%mm5, %%mm0\n\t"
  588. "paddusw %%mm5, %%mm2\n\t"
  589. "psrlw $1, %%mm0\n\t"
  590. "psrlw $1, %%mm2\n\t"
  591. "packuswb %%mm2, %%mm0\n\t"
  592. "movq %%mm0, %0\n\t"
  593. :"=m"(*p)
  594. :"m"(*pix),
  595. "m"(*(pix+line_size)), "m"(mm_wone)
  596. :"memory");
  597. pix += line_size;
  598. p += line_size ;
  599. } while(--h);
  600. }
  601. static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  602. {
  603. UINT8 *p;
  604. const UINT8 *pix;
  605. p = block;
  606. pix = pixels;
  607. __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory");
  608. do {
  609. __asm __volatile(
  610. "movq %1, %%mm0\n\t"
  611. "movq %0, %%mm1\n\t"
  612. "movq %%mm0, %%mm2\n\t"
  613. "movq %%mm1, %%mm3\n\t"
  614. "punpcklbw %%mm7, %%mm0\n\t"
  615. "punpcklbw %%mm7, %%mm1\n\t"
  616. "punpckhbw %%mm7, %%mm2\n\t"
  617. "punpckhbw %%mm7, %%mm3\n\t"
  618. "paddusw %%mm1, %%mm0\n\t"
  619. "paddusw %%mm3, %%mm2\n\t"
  620. "psrlw $1, %%mm0\n\t"
  621. "psrlw $1, %%mm2\n\t"
  622. "packuswb %%mm2, %%mm0\n\t"
  623. "movq %%mm0, %0\n\t"
  624. :"=m"(*p)
  625. :"m"(*pix)
  626. :"memory");
  627. pix += line_size;
  628. p += line_size ;
  629. } while (--h);
  630. }
  631. static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  632. {
  633. UINT8 *p;
  634. const UINT8 *pix;
  635. p = block;
  636. pix = pixels;
  637. __asm __volatile(
  638. "pxor %%mm7, %%mm7\n\t":::"memory");
  639. do {
  640. __asm __volatile(
  641. "movq %1, %%mm0\n\t"
  642. "movq 1%1, %%mm1\n\t"
  643. "movq %0, %%mm4\n\t"
  644. "movq %%mm0, %%mm2\n\t"
  645. "movq %%mm1, %%mm3\n\t"
  646. "movq %%mm4, %%mm5\n\t"
  647. "punpcklbw %%mm7, %%mm0\n\t"
  648. "punpcklbw %%mm7, %%mm1\n\t"
  649. "punpckhbw %%mm7, %%mm2\n\t"
  650. "punpckhbw %%mm7, %%mm3\n\t"
  651. "punpcklbw %%mm7, %%mm4\n\t"
  652. "punpckhbw %%mm7, %%mm5\n\t"
  653. "paddusw %%mm1, %%mm0\n\t"
  654. "paddusw %%mm3, %%mm2\n\t"
  655. "psrlw $1, %%mm0\n\t"
  656. "psrlw $1, %%mm2\n\t"
  657. "paddusw %%mm4, %%mm0\n\t"
  658. "paddusw %%mm5, %%mm2\n\t"
  659. "psrlw $1, %%mm0\n\t"
  660. "psrlw $1, %%mm2\n\t"
  661. "packuswb %%mm2, %%mm0\n\t"
  662. "movq %%mm0, %0\n\t"
  663. :"=m"(*p)
  664. :"m"(*pix)
  665. :"memory");
  666. pix += line_size;
  667. p += line_size;
  668. } while (--h);
  669. }
  670. static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  671. {
  672. UINT8 *p;
  673. const UINT8 *pix;
  674. p = block;
  675. pix = pixels;
  676. __asm __volatile(
  677. "pxor %%mm7, %%mm7\n\t":::"memory");
  678. do {
  679. __asm __volatile(
  680. "movq %1, %%mm0\n\t"
  681. "movq %2, %%mm1\n\t"
  682. "movq %0, %%mm4\n\t"
  683. "movq %%mm0, %%mm2\n\t"
  684. "movq %%mm1, %%mm3\n\t"
  685. "movq %%mm4, %%mm5\n\t"
  686. "punpcklbw %%mm7, %%mm0\n\t"
  687. "punpcklbw %%mm7, %%mm1\n\t"
  688. "punpckhbw %%mm7, %%mm2\n\t"
  689. "punpckhbw %%mm7, %%mm3\n\t"
  690. "punpcklbw %%mm7, %%mm4\n\t"
  691. "punpckhbw %%mm7, %%mm5\n\t"
  692. "paddusw %%mm1, %%mm0\n\t"
  693. "paddusw %%mm3, %%mm2\n\t"
  694. "psrlw $1, %%mm0\n\t"
  695. "psrlw $1, %%mm2\n\t"
  696. "paddusw %%mm4, %%mm0\n\t"
  697. "paddusw %%mm5, %%mm2\n\t"
  698. "psrlw $1, %%mm0\n\t"
  699. "psrlw $1, %%mm2\n\t"
  700. "packuswb %%mm2, %%mm0\n\t"
  701. "movq %%mm0, %0\n\t"
  702. :"=m"(*p)
  703. :"m"(*pix), "m"(*(pix+line_size))
  704. :"memory");
  705. pix += line_size;
  706. p += line_size ;
  707. } while(--h);
  708. }
  709. static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  710. {
  711. UINT8 *p;
  712. const UINT8 *pix;
  713. p = block;
  714. pix = pixels;
  715. __asm __volatile(
  716. "pxor %%mm7, %%mm7\n\t"
  717. "movq %0, %%mm6\n\t"
  718. ::"m"(mm_wone):"memory");
  719. do {
  720. __asm __volatile(
  721. "movq %1, %%mm0\n\t"
  722. "movq %2, %%mm1\n\t"
  723. "movq 1%1, %%mm4\n\t"
  724. "movq 1%2, %%mm5\n\t"
  725. "movq %%mm0, %%mm2\n\t"
  726. "movq %%mm1, %%mm3\n\t"
  727. "punpcklbw %%mm7, %%mm0\n\t"
  728. "punpcklbw %%mm7, %%mm1\n\t"
  729. "punpckhbw %%mm7, %%mm2\n\t"
  730. "punpckhbw %%mm7, %%mm3\n\t"
  731. "paddusw %%mm1, %%mm0\n\t"
  732. "paddusw %%mm3, %%mm2\n\t"
  733. "movq %%mm4, %%mm1\n\t"
  734. "movq %%mm5, %%mm3\n\t"
  735. "punpcklbw %%mm7, %%mm4\n\t"
  736. "punpcklbw %%mm7, %%mm5\n\t"
  737. "punpckhbw %%mm7, %%mm1\n\t"
  738. "punpckhbw %%mm7, %%mm3\n\t"
  739. "paddusw %%mm5, %%mm4\n\t"
  740. "paddusw %%mm3, %%mm1\n\t"
  741. "paddusw %%mm6, %%mm4\n\t"
  742. "paddusw %%mm6, %%mm1\n\t"
  743. "paddusw %%mm4, %%mm0\n\t"
  744. "paddusw %%mm1, %%mm2\n\t"
  745. "movq %0, %%mm1\n\t"
  746. "psrlw $2, %%mm0\n\t"
  747. "movq %%mm1, %%mm3\n\t"
  748. "psrlw $2, %%mm2\n\t"
  749. "punpcklbw %%mm7, %%mm1\n\t"
  750. "punpckhbw %%mm7, %%mm3\n\t"
  751. "paddusw %%mm1, %%mm0\n\t"
  752. "paddusw %%mm3, %%mm2\n\t"
  753. "psrlw $1, %%mm0\n\t"
  754. "psrlw $1, %%mm2\n\t"
  755. "packuswb %%mm2, %%mm0\n\t"
  756. "movq %%mm0, %0\n\t"
  757. :"=m"(*p)
  758. :"m"(*pix),
  759. "m"(*(pix+line_size))
  760. :"memory");
  761. pix += line_size;
  762. p += line_size;
  763. } while(--h);
  764. }
  765. static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  766. {
  767. DCTELEM *p;
  768. const UINT8 *pix;
  769. p = block;
  770. pix = pixels;
  771. __asm __volatile("pxor %%mm7, %%mm7":::"memory");
  772. do {
  773. __asm __volatile(
  774. "movq %0, %%mm0\n\t"
  775. "movq %1, %%mm2\n\t"
  776. "movq 8%0, %%mm1\n\t"
  777. "movq %%mm2, %%mm3\n\t"
  778. "punpcklbw %%mm7, %%mm2\n\t"
  779. "punpckhbw %%mm7, %%mm3\n\t"
  780. "psubsw %%mm2, %%mm0\n\t"
  781. "psubsw %%mm3, %%mm1\n\t"
  782. "movq %%mm0, %0\n\t"
  783. "movq %%mm1, 8%0\n\t"
  784. :"=m"(*p)
  785. :"m"(*pix)
  786. :"memory");
  787. pix += line_size;
  788. p += 8;
  789. } while (--h);
  790. }
  791. static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  792. {
  793. DCTELEM *p;
  794. const UINT8 *pix;
  795. p = block;
  796. pix = pixels;
  797. __asm __volatile(
  798. "pxor %%mm7, %%mm7\n\t"
  799. "movq %0, %%mm6"
  800. ::"m"(mm_wone):"memory");
  801. do {
  802. __asm __volatile(
  803. "movq %0, %%mm0\n\t"
  804. "movq %1, %%mm2\n\t"
  805. "movq 8%0, %%mm1\n\t"
  806. "movq 1%1, %%mm4\n\t"
  807. "movq %%mm2, %%mm3\n\t"
  808. "movq %%mm4, %%mm5\n\t"
  809. "punpcklbw %%mm7, %%mm2\n\t"
  810. "punpckhbw %%mm7, %%mm3\n\t"
  811. "punpcklbw %%mm7, %%mm4\n\t"
  812. "punpckhbw %%mm7, %%mm5\n\t"
  813. "paddusw %%mm4, %%mm2\n\t"
  814. "paddusw %%mm5, %%mm3\n\t"
  815. "paddusw %%mm6, %%mm2\n\t"
  816. "paddusw %%mm6, %%mm3\n\t"
  817. "psrlw $1, %%mm2\n\t"
  818. "psrlw $1, %%mm3\n\t"
  819. "psubsw %%mm2, %%mm0\n\t"
  820. "psubsw %%mm3, %%mm1\n\t"
  821. "movq %%mm0, %0\n\t"
  822. "movq %%mm1, 8%0\n\t"
  823. :"=m"(*p)
  824. :"m"(*pix)
  825. :"memory");
  826. pix += line_size;
  827. p += 8;
  828. } while (--h);
  829. }
  830. static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  831. {
  832. DCTELEM *p;
  833. const UINT8 *pix;
  834. p = block;
  835. pix = pixels;
  836. __asm __volatile(
  837. "pxor %%mm7, %%mm7\n\t"
  838. "movq %0, %%mm6"
  839. ::"m"(mm_wone):"memory");
  840. do {
  841. __asm __volatile(
  842. "movq %0, %%mm0\n\t"
  843. "movq %1, %%mm2\n\t"
  844. "movq 8%0, %%mm1\n\t"
  845. "movq %2, %%mm4\n\t"
  846. "movq %%mm2, %%mm3\n\t"
  847. "movq %%mm4, %%mm5\n\t"
  848. "punpcklbw %%mm7, %%mm2\n\t"
  849. "punpckhbw %%mm7, %%mm3\n\t"
  850. "punpcklbw %%mm7, %%mm4\n\t"
  851. "punpckhbw %%mm7, %%mm5\n\t"
  852. "paddusw %%mm4, %%mm2\n\t"
  853. "paddusw %%mm5, %%mm3\n\t"
  854. "paddusw %%mm6, %%mm2\n\t"
  855. "paddusw %%mm6, %%mm3\n\t"
  856. "psrlw $1, %%mm2\n\t"
  857. "psrlw $1, %%mm3\n\t"
  858. "psubsw %%mm2, %%mm0\n\t"
  859. "psubsw %%mm3, %%mm1\n\t"
  860. "movq %%mm0, %0\n\t"
  861. "movq %%mm1, 8%0\n\t"
  862. :"=m"(*p)
  863. :"m"(*pix), "m"(*(pix+line_size))
  864. :"memory");
  865. pix += line_size;
  866. p += 8;
  867. } while (--h);
  868. }
  869. static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  870. {
  871. DCTELEM *p;
  872. const UINT8 *pix;
  873. p = block;
  874. pix = pixels;
  875. __asm __volatile(
  876. "pxor %%mm7, %%mm7\n\t"
  877. "movq %0, %%mm6\n\t"
  878. ::"m"(mm_wtwo):"memory");
  879. do {
  880. __asm __volatile(
  881. "movq %1, %%mm0\n\t"
  882. "movq %2, %%mm1\n\t"
  883. "movq 1%1, %%mm4\n\t"
  884. "movq 1%2, %%mm5\n\t"
  885. "movq %%mm0, %%mm2\n\t"
  886. "movq %%mm1, %%mm3\n\t"
  887. "punpcklbw %%mm7, %%mm0\n\t"
  888. "punpcklbw %%mm7, %%mm1\n\t"
  889. "punpckhbw %%mm7, %%mm2\n\t"
  890. "punpckhbw %%mm7, %%mm3\n\t"
  891. "paddusw %%mm1, %%mm0\n\t"
  892. "paddusw %%mm3, %%mm2\n\t"
  893. "movq %%mm4, %%mm1\n\t"
  894. "movq %%mm5, %%mm3\n\t"
  895. "punpcklbw %%mm7, %%mm4\n\t"
  896. "punpcklbw %%mm7, %%mm5\n\t"
  897. "punpckhbw %%mm7, %%mm1\n\t"
  898. "punpckhbw %%mm7, %%mm3\n\t"
  899. "paddusw %%mm5, %%mm4\n\t"
  900. "paddusw %%mm3, %%mm1\n\t"
  901. "paddusw %%mm6, %%mm4\n\t"
  902. "paddusw %%mm6, %%mm1\n\t"
  903. "paddusw %%mm4, %%mm0\n\t"
  904. "paddusw %%mm1, %%mm2\n\t"
  905. "movq %0, %%mm1\n\t"
  906. "movq 8%0, %%mm3\n\t"
  907. "psrlw $2, %%mm0\n\t"
  908. "psrlw $2, %%mm2\n\t"
  909. "psubsw %%mm0, %%mm1\n\t"
  910. "psubsw %%mm2, %%mm3\n\t"
  911. "movq %%mm1, %0\n\t"
  912. "movq %%mm3, 8%0\n\t"
  913. :"=m"(*p)
  914. :"m"(*pix),
  915. "m"(*(pix+line_size))
  916. :"memory");
  917. pix += line_size;
  918. p += 8 ;
  919. } while(--h);
  920. }
  921. void dsputil_init_mmx(void)
  922. {
  923. mm_flags = mm_support();
  924. #if 0
  925. printf("CPU flags:");
  926. if (mm_flags & MM_MMX)
  927. printf(" mmx");
  928. if (mm_flags & MM_MMXEXT)
  929. printf(" mmxext");
  930. if (mm_flags & MM_3DNOW)
  931. printf(" 3dnow");
  932. if (mm_flags & MM_SSE)
  933. printf(" sse");
  934. if (mm_flags & MM_SSE2)
  935. printf(" sse2");
  936. printf("\n");
  937. #endif
  938. if (mm_flags & MM_MMX) {
  939. get_pixels = get_pixels_mmx;
  940. put_pixels_clamped = put_pixels_clamped_mmx;
  941. add_pixels_clamped = add_pixels_clamped_mmx;
  942. pix_abs16x16 = pix_abs16x16_mmx;
  943. pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
  944. pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
  945. pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
  946. av_fdct = fdct_mmx;
  947. put_pixels_tab[0] = put_pixels_mmx;
  948. put_pixels_tab[1] = put_pixels_x2_mmx;
  949. put_pixels_tab[2] = put_pixels_y2_mmx;
  950. put_pixels_tab[3] = put_pixels_xy2_mmx;
  951. put_no_rnd_pixels_tab[0] = put_pixels_mmx;
  952. put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
  953. put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
  954. put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
  955. avg_pixels_tab[0] = avg_pixels_mmx;
  956. avg_pixels_tab[1] = avg_pixels_x2_mmx;
  957. avg_pixels_tab[2] = avg_pixels_y2_mmx;
  958. avg_pixels_tab[3] = avg_pixels_xy2_mmx;
  959. avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
  960. avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
  961. avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
  962. avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
  963. sub_pixels_tab[0] = sub_pixels_mmx;
  964. sub_pixels_tab[1] = sub_pixels_x2_mmx;
  965. sub_pixels_tab[2] = sub_pixels_y2_mmx;
  966. sub_pixels_tab[3] = sub_pixels_xy2_mmx;
  967. if (mm_flags & MM_MMXEXT) {
  968. pix_abs16x16 = pix_abs16x16_sse;
  969. }
  970. if (mm_flags & MM_SSE) {
  971. put_pixels_tab[1] = put_pixels_x2_sse;
  972. put_pixels_tab[2] = put_pixels_y2_sse;
  973. avg_pixels_tab[0] = avg_pixels_sse;
  974. avg_pixels_tab[1] = avg_pixels_x2_sse;
  975. avg_pixels_tab[2] = avg_pixels_y2_sse;
  976. avg_pixels_tab[3] = avg_pixels_xy2_sse;
  977. sub_pixels_tab[1] = sub_pixels_x2_sse;
  978. sub_pixels_tab[2] = sub_pixels_y2_sse;
  979. } else if (mm_flags & MM_3DNOW) {
  980. put_pixels_tab[1] = put_pixels_x2_3dnow;
  981. put_pixels_tab[2] = put_pixels_y2_3dnow;
  982. avg_pixels_tab[0] = avg_pixels_3dnow;
  983. avg_pixels_tab[1] = avg_pixels_x2_3dnow;
  984. avg_pixels_tab[2] = avg_pixels_y2_3dnow;
  985. avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
  986. sub_pixels_tab[1] = sub_pixels_x2_3dnow;
  987. sub_pixels_tab[2] = sub_pixels_y2_3dnow;
  988. }
  989. }
  990. }