You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2690 lines
68KB

  1. /*
  2. *
  3. * rgb2rgb.c, Software RGB to RGB convertor
  4. * pluralize by Software PAL8 to RGB convertor
  5. * Software YUV to YUV convertor
  6. * Software YUV to RGB convertor
  7. * Written by Nick Kurshev.
  8. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  9. * lot of big-endian byteorder fixes by Alex Beregszaszi
  10. *
  11. * This file is part of FFmpeg.
  12. *
  13. * FFmpeg is free software; you can redistribute it and/or modify
  14. * it under the terms of the GNU General Public License as published by
  15. * the Free Software Foundation; either version 2 of the License, or
  16. * (at your option) any later version.
  17. *
  18. * FFmpeg is distributed in the hope that it will be useful,
  19. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  21. * GNU General Public License for more details.
  22. *
  23. * You should have received a copy of the GNU General Public License
  24. * along with FFmpeg; if not, write to the Free Software
  25. * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  26. */
  27. #include <stddef.h>
  28. #include <inttypes.h> /* for __WORDSIZE */
  /* Use the project-provided word size when the system headers do not define
     __WORDSIZE (such a configuration will probably cost performance). */
  #if !defined(__WORDSIZE)
  #define __WORDSIZE MP_WORDSIZE
  #endif

  /* Start from a clean slate: every helper macro below is (re)defined here. */
  #undef PREFETCH
  #undef MOVNTQ
  #undef EMMS
  #undef SFENCE
  #undef MMREG_SIZE
  #undef PREFETCHW
  #undef PAVGB

  /* Width in bytes of the widest multimedia register that is assumed. */
  #if defined(HAVE_SSE2)
  #define MMREG_SIZE 16
  #else
  #define MMREG_SIZE 8
  #endif

  /* Prefetch and byte-average mnemonics. Without 3DNow! or MMX2 the prefetch
     macros expand to text the assembler is expected to treat as a comment. */
  #if defined(HAVE_3DNOW)
  #define PREFETCH "prefetch"
  #define PREFETCHW "prefetchw"
  #define PAVGB "pavgusb"
  #elif defined(HAVE_MMX2)
  #define PREFETCH "prefetchnta"
  #define PREFETCHW "prefetcht0"
  #define PAVGB "pavgb"
  #elif defined(__APPLE__)
  #define PREFETCH "#"
  #define PREFETCHW "#"
  #else
  #define PREFETCH "/nop"
  #define PREFETCHW "/nop"
  #endif

  /* On K6 femms is faster than emms. On K7 femms is directly mapped onto emms. */
  #if defined(HAVE_3DNOW)
  #define EMMS "femms"
  #else
  #define EMMS "emms"
  #endif

  /* Non-temporal store plus store fence when MMX2 is available; otherwise a
     plain movq and a fence that expands to assembler comment text. */
  #if defined(HAVE_MMX2)
  #define MOVNTQ "movntq"
  #define SFENCE "sfence"
  #elif defined(__APPLE__)
  #define MOVNTQ "movq"
  #define SFENCE "#"
  #else
  #define MOVNTQ "movq"
  #define SFENCE "/nop"
  #endif
  79. static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
  80. {
  81. uint8_t *dest = dst;
  82. const uint8_t *s = src;
  83. const uint8_t *end;
  84. #ifdef HAVE_MMX
  85. const uint8_t *mm_end;
  86. #endif
  87. end = s + src_size;
  88. #ifdef HAVE_MMX
  89. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  90. mm_end = end - 23;
  91. __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
  92. while(s < mm_end)
  93. {
  94. __asm __volatile(
  95. PREFETCH" 32%1\n\t"
  96. "movd %1, %%mm0\n\t"
  97. "punpckldq 3%1, %%mm0\n\t"
  98. "movd 6%1, %%mm1\n\t"
  99. "punpckldq 9%1, %%mm1\n\t"
  100. "movd 12%1, %%mm2\n\t"
  101. "punpckldq 15%1, %%mm2\n\t"
  102. "movd 18%1, %%mm3\n\t"
  103. "punpckldq 21%1, %%mm3\n\t"
  104. "pand %%mm7, %%mm0\n\t"
  105. "pand %%mm7, %%mm1\n\t"
  106. "pand %%mm7, %%mm2\n\t"
  107. "pand %%mm7, %%mm3\n\t"
  108. MOVNTQ" %%mm0, %0\n\t"
  109. MOVNTQ" %%mm1, 8%0\n\t"
  110. MOVNTQ" %%mm2, 16%0\n\t"
  111. MOVNTQ" %%mm3, 24%0"
  112. :"=m"(*dest)
  113. :"m"(*s)
  114. :"memory");
  115. dest += 32;
  116. s += 24;
  117. }
  118. __asm __volatile(SFENCE:::"memory");
  119. __asm __volatile(EMMS:::"memory");
  120. #endif
  121. while(s < end)
  122. {
  123. #ifdef WORDS_BIGENDIAN
  124. /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
  125. *dest++ = 0;
  126. *dest++ = s[2];
  127. *dest++ = s[1];
  128. *dest++ = s[0];
  129. s+=3;
  130. #else
  131. *dest++ = *s++;
  132. *dest++ = *s++;
  133. *dest++ = *s++;
  134. *dest++ = 0;
  135. #endif
  136. }
  137. }
  138. static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
  139. {
  140. uint8_t *dest = dst;
  141. const uint8_t *s = src;
  142. const uint8_t *end;
  143. #ifdef HAVE_MMX
  144. const uint8_t *mm_end;
  145. #endif
  146. end = s + src_size;
  147. #ifdef HAVE_MMX
  148. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  149. mm_end = end - 31;
  150. while(s < mm_end)
  151. {
  152. __asm __volatile(
  153. PREFETCH" 32%1\n\t"
  154. "movq %1, %%mm0\n\t"
  155. "movq 8%1, %%mm1\n\t"
  156. "movq 16%1, %%mm4\n\t"
  157. "movq 24%1, %%mm5\n\t"
  158. "movq %%mm0, %%mm2\n\t"
  159. "movq %%mm1, %%mm3\n\t"
  160. "movq %%mm4, %%mm6\n\t"
  161. "movq %%mm5, %%mm7\n\t"
  162. "psrlq $8, %%mm2\n\t"
  163. "psrlq $8, %%mm3\n\t"
  164. "psrlq $8, %%mm6\n\t"
  165. "psrlq $8, %%mm7\n\t"
  166. "pand %2, %%mm0\n\t"
  167. "pand %2, %%mm1\n\t"
  168. "pand %2, %%mm4\n\t"
  169. "pand %2, %%mm5\n\t"
  170. "pand %3, %%mm2\n\t"
  171. "pand %3, %%mm3\n\t"
  172. "pand %3, %%mm6\n\t"
  173. "pand %3, %%mm7\n\t"
  174. "por %%mm2, %%mm0\n\t"
  175. "por %%mm3, %%mm1\n\t"
  176. "por %%mm6, %%mm4\n\t"
  177. "por %%mm7, %%mm5\n\t"
  178. "movq %%mm1, %%mm2\n\t"
  179. "movq %%mm4, %%mm3\n\t"
  180. "psllq $48, %%mm2\n\t"
  181. "psllq $32, %%mm3\n\t"
  182. "pand %4, %%mm2\n\t"
  183. "pand %5, %%mm3\n\t"
  184. "por %%mm2, %%mm0\n\t"
  185. "psrlq $16, %%mm1\n\t"
  186. "psrlq $32, %%mm4\n\t"
  187. "psllq $16, %%mm5\n\t"
  188. "por %%mm3, %%mm1\n\t"
  189. "pand %6, %%mm5\n\t"
  190. "por %%mm5, %%mm4\n\t"
  191. MOVNTQ" %%mm0, %0\n\t"
  192. MOVNTQ" %%mm1, 8%0\n\t"
  193. MOVNTQ" %%mm4, 16%0"
  194. :"=m"(*dest)
  195. :"m"(*s),"m"(mask24l),
  196. "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
  197. :"memory");
  198. dest += 24;
  199. s += 32;
  200. }
  201. __asm __volatile(SFENCE:::"memory");
  202. __asm __volatile(EMMS:::"memory");
  203. #endif
  204. while(s < end)
  205. {
  206. #ifdef WORDS_BIGENDIAN
  207. /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
  208. s++;
  209. dest[2] = *s++;
  210. dest[1] = *s++;
  211. dest[0] = *s++;
  212. dest += 3;
  213. #else
  214. *dest++ = *s++;
  215. *dest++ = *s++;
  216. *dest++ = *s++;
  217. s++;
  218. #endif
  219. }
  220. }
  221. /*
  222. Original by Strepto/Astral
  223. ported to gcc & bugfixed : A'rpi
  224. MMX2, 3DNOW optimization by Nick Kurshev
  225. 32bit c version, and and&add trick by Michael Niedermayer
  226. */
  227. static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
  228. {
  229. register const uint8_t* s=src;
  230. register uint8_t* d=dst;
  231. register const uint8_t *end;
  232. const uint8_t *mm_end;
  233. end = s + src_size;
  234. #ifdef HAVE_MMX
  235. __asm __volatile(PREFETCH" %0"::"m"(*s));
  236. __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
  237. mm_end = end - 15;
  238. while(s<mm_end)
  239. {
  240. __asm __volatile(
  241. PREFETCH" 32%1\n\t"
  242. "movq %1, %%mm0\n\t"
  243. "movq 8%1, %%mm2\n\t"
  244. "movq %%mm0, %%mm1\n\t"
  245. "movq %%mm2, %%mm3\n\t"
  246. "pand %%mm4, %%mm0\n\t"
  247. "pand %%mm4, %%mm2\n\t"
  248. "paddw %%mm1, %%mm0\n\t"
  249. "paddw %%mm3, %%mm2\n\t"
  250. MOVNTQ" %%mm0, %0\n\t"
  251. MOVNTQ" %%mm2, 8%0"
  252. :"=m"(*d)
  253. :"m"(*s)
  254. );
  255. d+=16;
  256. s+=16;
  257. }
  258. __asm __volatile(SFENCE:::"memory");
  259. __asm __volatile(EMMS:::"memory");
  260. #endif
  261. mm_end = end - 3;
  262. while(s < mm_end)
  263. {
  264. register unsigned x= *((uint32_t *)s);
  265. *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
  266. d+=4;
  267. s+=4;
  268. }
  269. if(s < end)
  270. {
  271. register unsigned short x= *((uint16_t *)s);
  272. *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
  273. }
  274. }
  275. static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
  276. {
  277. register const uint8_t* s=src;
  278. register uint8_t* d=dst;
  279. register const uint8_t *end;
  280. const uint8_t *mm_end;
  281. end = s + src_size;
  282. #ifdef HAVE_MMX
  283. __asm __volatile(PREFETCH" %0"::"m"(*s));
  284. __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
  285. __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
  286. mm_end = end - 15;
  287. while(s<mm_end)
  288. {
  289. __asm __volatile(
  290. PREFETCH" 32%1\n\t"
  291. "movq %1, %%mm0\n\t"
  292. "movq 8%1, %%mm2\n\t"
  293. "movq %%mm0, %%mm1\n\t"
  294. "movq %%mm2, %%mm3\n\t"
  295. "psrlq $1, %%mm0\n\t"
  296. "psrlq $1, %%mm2\n\t"
  297. "pand %%mm7, %%mm0\n\t"
  298. "pand %%mm7, %%mm2\n\t"
  299. "pand %%mm6, %%mm1\n\t"
  300. "pand %%mm6, %%mm3\n\t"
  301. "por %%mm1, %%mm0\n\t"
  302. "por %%mm3, %%mm2\n\t"
  303. MOVNTQ" %%mm0, %0\n\t"
  304. MOVNTQ" %%mm2, 8%0"
  305. :"=m"(*d)
  306. :"m"(*s)
  307. );
  308. d+=16;
  309. s+=16;
  310. }
  311. __asm __volatile(SFENCE:::"memory");
  312. __asm __volatile(EMMS:::"memory");
  313. #endif
  314. mm_end = end - 3;
  315. while(s < mm_end)
  316. {
  317. register uint32_t x= *((uint32_t *)s);
  318. *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
  319. s+=4;
  320. d+=4;
  321. }
  322. if(s < end)
  323. {
  324. register uint16_t x= *((uint16_t *)s);
  325. *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
  326. s+=2;
  327. d+=2;
  328. }
  329. }
  330. static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
  331. {
  332. const uint8_t *s = src;
  333. const uint8_t *end;
  334. #ifdef HAVE_MMX
  335. const uint8_t *mm_end;
  336. #endif
  337. uint16_t *d = (uint16_t *)dst;
  338. end = s + src_size;
  339. #ifdef HAVE_MMX
  340. mm_end = end - 15;
  341. #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
  342. asm volatile(
  343. "movq %3, %%mm5 \n\t"
  344. "movq %4, %%mm6 \n\t"
  345. "movq %5, %%mm7 \n\t"
  346. ASMALIGN(4)
  347. "1: \n\t"
  348. PREFETCH" 32(%1) \n\t"
  349. "movd (%1), %%mm0 \n\t"
  350. "movd 4(%1), %%mm3 \n\t"
  351. "punpckldq 8(%1), %%mm0 \n\t"
  352. "punpckldq 12(%1), %%mm3 \n\t"
  353. "movq %%mm0, %%mm1 \n\t"
  354. "movq %%mm3, %%mm4 \n\t"
  355. "pand %%mm6, %%mm0 \n\t"
  356. "pand %%mm6, %%mm3 \n\t"
  357. "pmaddwd %%mm7, %%mm0 \n\t"
  358. "pmaddwd %%mm7, %%mm3 \n\t"
  359. "pand %%mm5, %%mm1 \n\t"
  360. "pand %%mm5, %%mm4 \n\t"
  361. "por %%mm1, %%mm0 \n\t"
  362. "por %%mm4, %%mm3 \n\t"
  363. "psrld $5, %%mm0 \n\t"
  364. "pslld $11, %%mm3 \n\t"
  365. "por %%mm3, %%mm0 \n\t"
  366. MOVNTQ" %%mm0, (%0) \n\t"
  367. "add $16, %1 \n\t"
  368. "add $8, %0 \n\t"
  369. "cmp %2, %1 \n\t"
  370. " jb 1b \n\t"
  371. : "+r" (d), "+r"(s)
  372. : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
  373. );
  374. #else
  375. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  376. __asm __volatile(
  377. "movq %0, %%mm7\n\t"
  378. "movq %1, %%mm6\n\t"
  379. ::"m"(red_16mask),"m"(green_16mask));
  380. while(s < mm_end)
  381. {
  382. __asm __volatile(
  383. PREFETCH" 32%1\n\t"
  384. "movd %1, %%mm0\n\t"
  385. "movd 4%1, %%mm3\n\t"
  386. "punpckldq 8%1, %%mm0\n\t"
  387. "punpckldq 12%1, %%mm3\n\t"
  388. "movq %%mm0, %%mm1\n\t"
  389. "movq %%mm0, %%mm2\n\t"
  390. "movq %%mm3, %%mm4\n\t"
  391. "movq %%mm3, %%mm5\n\t"
  392. "psrlq $3, %%mm0\n\t"
  393. "psrlq $3, %%mm3\n\t"
  394. "pand %2, %%mm0\n\t"
  395. "pand %2, %%mm3\n\t"
  396. "psrlq $5, %%mm1\n\t"
  397. "psrlq $5, %%mm4\n\t"
  398. "pand %%mm6, %%mm1\n\t"
  399. "pand %%mm6, %%mm4\n\t"
  400. "psrlq $8, %%mm2\n\t"
  401. "psrlq $8, %%mm5\n\t"
  402. "pand %%mm7, %%mm2\n\t"
  403. "pand %%mm7, %%mm5\n\t"
  404. "por %%mm1, %%mm0\n\t"
  405. "por %%mm4, %%mm3\n\t"
  406. "por %%mm2, %%mm0\n\t"
  407. "por %%mm5, %%mm3\n\t"
  408. "psllq $16, %%mm3\n\t"
  409. "por %%mm3, %%mm0\n\t"
  410. MOVNTQ" %%mm0, %0\n\t"
  411. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  412. d += 4;
  413. s += 16;
  414. }
  415. #endif
  416. __asm __volatile(SFENCE:::"memory");
  417. __asm __volatile(EMMS:::"memory");
  418. #endif
  419. while(s < end)
  420. {
  421. register int rgb = *(uint32_t*)s; s += 4;
  422. *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
  423. }
  424. }
  425. static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
  426. {
  427. const uint8_t *s = src;
  428. const uint8_t *end;
  429. #ifdef HAVE_MMX
  430. const uint8_t *mm_end;
  431. #endif
  432. uint16_t *d = (uint16_t *)dst;
  433. end = s + src_size;
  434. #ifdef HAVE_MMX
  435. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  436. __asm __volatile(
  437. "movq %0, %%mm7\n\t"
  438. "movq %1, %%mm6\n\t"
  439. ::"m"(red_16mask),"m"(green_16mask));
  440. mm_end = end - 15;
  441. while(s < mm_end)
  442. {
  443. __asm __volatile(
  444. PREFETCH" 32%1\n\t"
  445. "movd %1, %%mm0\n\t"
  446. "movd 4%1, %%mm3\n\t"
  447. "punpckldq 8%1, %%mm0\n\t"
  448. "punpckldq 12%1, %%mm3\n\t"
  449. "movq %%mm0, %%mm1\n\t"
  450. "movq %%mm0, %%mm2\n\t"
  451. "movq %%mm3, %%mm4\n\t"
  452. "movq %%mm3, %%mm5\n\t"
  453. "psllq $8, %%mm0\n\t"
  454. "psllq $8, %%mm3\n\t"
  455. "pand %%mm7, %%mm0\n\t"
  456. "pand %%mm7, %%mm3\n\t"
  457. "psrlq $5, %%mm1\n\t"
  458. "psrlq $5, %%mm4\n\t"
  459. "pand %%mm6, %%mm1\n\t"
  460. "pand %%mm6, %%mm4\n\t"
  461. "psrlq $19, %%mm2\n\t"
  462. "psrlq $19, %%mm5\n\t"
  463. "pand %2, %%mm2\n\t"
  464. "pand %2, %%mm5\n\t"
  465. "por %%mm1, %%mm0\n\t"
  466. "por %%mm4, %%mm3\n\t"
  467. "por %%mm2, %%mm0\n\t"
  468. "por %%mm5, %%mm3\n\t"
  469. "psllq $16, %%mm3\n\t"
  470. "por %%mm3, %%mm0\n\t"
  471. MOVNTQ" %%mm0, %0\n\t"
  472. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  473. d += 4;
  474. s += 16;
  475. }
  476. __asm __volatile(SFENCE:::"memory");
  477. __asm __volatile(EMMS:::"memory");
  478. #endif
  479. while(s < end)
  480. {
  481. register int rgb = *(uint32_t*)s; s += 4;
  482. *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
  483. }
  484. }
  485. static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
  486. {
  487. const uint8_t *s = src;
  488. const uint8_t *end;
  489. #ifdef HAVE_MMX
  490. const uint8_t *mm_end;
  491. #endif
  492. uint16_t *d = (uint16_t *)dst;
  493. end = s + src_size;
  494. #ifdef HAVE_MMX
  495. mm_end = end - 15;
  496. #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
  497. asm volatile(
  498. "movq %3, %%mm5 \n\t"
  499. "movq %4, %%mm6 \n\t"
  500. "movq %5, %%mm7 \n\t"
  501. ASMALIGN(4)
  502. "1: \n\t"
  503. PREFETCH" 32(%1) \n\t"
  504. "movd (%1), %%mm0 \n\t"
  505. "movd 4(%1), %%mm3 \n\t"
  506. "punpckldq 8(%1), %%mm0 \n\t"
  507. "punpckldq 12(%1), %%mm3 \n\t"
  508. "movq %%mm0, %%mm1 \n\t"
  509. "movq %%mm3, %%mm4 \n\t"
  510. "pand %%mm6, %%mm0 \n\t"
  511. "pand %%mm6, %%mm3 \n\t"
  512. "pmaddwd %%mm7, %%mm0 \n\t"
  513. "pmaddwd %%mm7, %%mm3 \n\t"
  514. "pand %%mm5, %%mm1 \n\t"
  515. "pand %%mm5, %%mm4 \n\t"
  516. "por %%mm1, %%mm0 \n\t"
  517. "por %%mm4, %%mm3 \n\t"
  518. "psrld $6, %%mm0 \n\t"
  519. "pslld $10, %%mm3 \n\t"
  520. "por %%mm3, %%mm0 \n\t"
  521. MOVNTQ" %%mm0, (%0) \n\t"
  522. "add $16, %1 \n\t"
  523. "add $8, %0 \n\t"
  524. "cmp %2, %1 \n\t"
  525. " jb 1b \n\t"
  526. : "+r" (d), "+r"(s)
  527. : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
  528. );
  529. #else
  530. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  531. __asm __volatile(
  532. "movq %0, %%mm7\n\t"
  533. "movq %1, %%mm6\n\t"
  534. ::"m"(red_15mask),"m"(green_15mask));
  535. while(s < mm_end)
  536. {
  537. __asm __volatile(
  538. PREFETCH" 32%1\n\t"
  539. "movd %1, %%mm0\n\t"
  540. "movd 4%1, %%mm3\n\t"
  541. "punpckldq 8%1, %%mm0\n\t"
  542. "punpckldq 12%1, %%mm3\n\t"
  543. "movq %%mm0, %%mm1\n\t"
  544. "movq %%mm0, %%mm2\n\t"
  545. "movq %%mm3, %%mm4\n\t"
  546. "movq %%mm3, %%mm5\n\t"
  547. "psrlq $3, %%mm0\n\t"
  548. "psrlq $3, %%mm3\n\t"
  549. "pand %2, %%mm0\n\t"
  550. "pand %2, %%mm3\n\t"
  551. "psrlq $6, %%mm1\n\t"
  552. "psrlq $6, %%mm4\n\t"
  553. "pand %%mm6, %%mm1\n\t"
  554. "pand %%mm6, %%mm4\n\t"
  555. "psrlq $9, %%mm2\n\t"
  556. "psrlq $9, %%mm5\n\t"
  557. "pand %%mm7, %%mm2\n\t"
  558. "pand %%mm7, %%mm5\n\t"
  559. "por %%mm1, %%mm0\n\t"
  560. "por %%mm4, %%mm3\n\t"
  561. "por %%mm2, %%mm0\n\t"
  562. "por %%mm5, %%mm3\n\t"
  563. "psllq $16, %%mm3\n\t"
  564. "por %%mm3, %%mm0\n\t"
  565. MOVNTQ" %%mm0, %0\n\t"
  566. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  567. d += 4;
  568. s += 16;
  569. }
  570. #endif
  571. __asm __volatile(SFENCE:::"memory");
  572. __asm __volatile(EMMS:::"memory");
  573. #endif
  574. while(s < end)
  575. {
  576. register int rgb = *(uint32_t*)s; s += 4;
  577. *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
  578. }
  579. }
  580. static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
  581. {
  582. const uint8_t *s = src;
  583. const uint8_t *end;
  584. #ifdef HAVE_MMX
  585. const uint8_t *mm_end;
  586. #endif
  587. uint16_t *d = (uint16_t *)dst;
  588. end = s + src_size;
  589. #ifdef HAVE_MMX
  590. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  591. __asm __volatile(
  592. "movq %0, %%mm7\n\t"
  593. "movq %1, %%mm6\n\t"
  594. ::"m"(red_15mask),"m"(green_15mask));
  595. mm_end = end - 15;
  596. while(s < mm_end)
  597. {
  598. __asm __volatile(
  599. PREFETCH" 32%1\n\t"
  600. "movd %1, %%mm0\n\t"
  601. "movd 4%1, %%mm3\n\t"
  602. "punpckldq 8%1, %%mm0\n\t"
  603. "punpckldq 12%1, %%mm3\n\t"
  604. "movq %%mm0, %%mm1\n\t"
  605. "movq %%mm0, %%mm2\n\t"
  606. "movq %%mm3, %%mm4\n\t"
  607. "movq %%mm3, %%mm5\n\t"
  608. "psllq $7, %%mm0\n\t"
  609. "psllq $7, %%mm3\n\t"
  610. "pand %%mm7, %%mm0\n\t"
  611. "pand %%mm7, %%mm3\n\t"
  612. "psrlq $6, %%mm1\n\t"
  613. "psrlq $6, %%mm4\n\t"
  614. "pand %%mm6, %%mm1\n\t"
  615. "pand %%mm6, %%mm4\n\t"
  616. "psrlq $19, %%mm2\n\t"
  617. "psrlq $19, %%mm5\n\t"
  618. "pand %2, %%mm2\n\t"
  619. "pand %2, %%mm5\n\t"
  620. "por %%mm1, %%mm0\n\t"
  621. "por %%mm4, %%mm3\n\t"
  622. "por %%mm2, %%mm0\n\t"
  623. "por %%mm5, %%mm3\n\t"
  624. "psllq $16, %%mm3\n\t"
  625. "por %%mm3, %%mm0\n\t"
  626. MOVNTQ" %%mm0, %0\n\t"
  627. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  628. d += 4;
  629. s += 16;
  630. }
  631. __asm __volatile(SFENCE:::"memory");
  632. __asm __volatile(EMMS:::"memory");
  633. #endif
  634. while(s < end)
  635. {
  636. register int rgb = *(uint32_t*)s; s += 4;
  637. *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
  638. }
  639. }
  640. static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
  641. {
  642. const uint8_t *s = src;
  643. const uint8_t *end;
  644. #ifdef HAVE_MMX
  645. const uint8_t *mm_end;
  646. #endif
  647. uint16_t *d = (uint16_t *)dst;
  648. end = s + src_size;
  649. #ifdef HAVE_MMX
  650. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  651. __asm __volatile(
  652. "movq %0, %%mm7\n\t"
  653. "movq %1, %%mm6\n\t"
  654. ::"m"(red_16mask),"m"(green_16mask));
  655. mm_end = end - 11;
  656. while(s < mm_end)
  657. {
  658. __asm __volatile(
  659. PREFETCH" 32%1\n\t"
  660. "movd %1, %%mm0\n\t"
  661. "movd 3%1, %%mm3\n\t"
  662. "punpckldq 6%1, %%mm0\n\t"
  663. "punpckldq 9%1, %%mm3\n\t"
  664. "movq %%mm0, %%mm1\n\t"
  665. "movq %%mm0, %%mm2\n\t"
  666. "movq %%mm3, %%mm4\n\t"
  667. "movq %%mm3, %%mm5\n\t"
  668. "psrlq $3, %%mm0\n\t"
  669. "psrlq $3, %%mm3\n\t"
  670. "pand %2, %%mm0\n\t"
  671. "pand %2, %%mm3\n\t"
  672. "psrlq $5, %%mm1\n\t"
  673. "psrlq $5, %%mm4\n\t"
  674. "pand %%mm6, %%mm1\n\t"
  675. "pand %%mm6, %%mm4\n\t"
  676. "psrlq $8, %%mm2\n\t"
  677. "psrlq $8, %%mm5\n\t"
  678. "pand %%mm7, %%mm2\n\t"
  679. "pand %%mm7, %%mm5\n\t"
  680. "por %%mm1, %%mm0\n\t"
  681. "por %%mm4, %%mm3\n\t"
  682. "por %%mm2, %%mm0\n\t"
  683. "por %%mm5, %%mm3\n\t"
  684. "psllq $16, %%mm3\n\t"
  685. "por %%mm3, %%mm0\n\t"
  686. MOVNTQ" %%mm0, %0\n\t"
  687. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  688. d += 4;
  689. s += 12;
  690. }
  691. __asm __volatile(SFENCE:::"memory");
  692. __asm __volatile(EMMS:::"memory");
  693. #endif
  694. while(s < end)
  695. {
  696. const int b= *s++;
  697. const int g= *s++;
  698. const int r= *s++;
  699. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  700. }
  701. }
  702. static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
  703. {
  704. const uint8_t *s = src;
  705. const uint8_t *end;
  706. #ifdef HAVE_MMX
  707. const uint8_t *mm_end;
  708. #endif
  709. uint16_t *d = (uint16_t *)dst;
  710. end = s + src_size;
  711. #ifdef HAVE_MMX
  712. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  713. __asm __volatile(
  714. "movq %0, %%mm7\n\t"
  715. "movq %1, %%mm6\n\t"
  716. ::"m"(red_16mask),"m"(green_16mask));
  717. mm_end = end - 15;
  718. while(s < mm_end)
  719. {
  720. __asm __volatile(
  721. PREFETCH" 32%1\n\t"
  722. "movd %1, %%mm0\n\t"
  723. "movd 3%1, %%mm3\n\t"
  724. "punpckldq 6%1, %%mm0\n\t"
  725. "punpckldq 9%1, %%mm3\n\t"
  726. "movq %%mm0, %%mm1\n\t"
  727. "movq %%mm0, %%mm2\n\t"
  728. "movq %%mm3, %%mm4\n\t"
  729. "movq %%mm3, %%mm5\n\t"
  730. "psllq $8, %%mm0\n\t"
  731. "psllq $8, %%mm3\n\t"
  732. "pand %%mm7, %%mm0\n\t"
  733. "pand %%mm7, %%mm3\n\t"
  734. "psrlq $5, %%mm1\n\t"
  735. "psrlq $5, %%mm4\n\t"
  736. "pand %%mm6, %%mm1\n\t"
  737. "pand %%mm6, %%mm4\n\t"
  738. "psrlq $19, %%mm2\n\t"
  739. "psrlq $19, %%mm5\n\t"
  740. "pand %2, %%mm2\n\t"
  741. "pand %2, %%mm5\n\t"
  742. "por %%mm1, %%mm0\n\t"
  743. "por %%mm4, %%mm3\n\t"
  744. "por %%mm2, %%mm0\n\t"
  745. "por %%mm5, %%mm3\n\t"
  746. "psllq $16, %%mm3\n\t"
  747. "por %%mm3, %%mm0\n\t"
  748. MOVNTQ" %%mm0, %0\n\t"
  749. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  750. d += 4;
  751. s += 12;
  752. }
  753. __asm __volatile(SFENCE:::"memory");
  754. __asm __volatile(EMMS:::"memory");
  755. #endif
  756. while(s < end)
  757. {
  758. const int r= *s++;
  759. const int g= *s++;
  760. const int b= *s++;
  761. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  762. }
  763. }
  764. static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
  765. {
  766. const uint8_t *s = src;
  767. const uint8_t *end;
  768. #ifdef HAVE_MMX
  769. const uint8_t *mm_end;
  770. #endif
  771. uint16_t *d = (uint16_t *)dst;
  772. end = s + src_size;
  773. #ifdef HAVE_MMX
  774. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  775. __asm __volatile(
  776. "movq %0, %%mm7\n\t"
  777. "movq %1, %%mm6\n\t"
  778. ::"m"(red_15mask),"m"(green_15mask));
  779. mm_end = end - 11;
  780. while(s < mm_end)
  781. {
  782. __asm __volatile(
  783. PREFETCH" 32%1\n\t"
  784. "movd %1, %%mm0\n\t"
  785. "movd 3%1, %%mm3\n\t"
  786. "punpckldq 6%1, %%mm0\n\t"
  787. "punpckldq 9%1, %%mm3\n\t"
  788. "movq %%mm0, %%mm1\n\t"
  789. "movq %%mm0, %%mm2\n\t"
  790. "movq %%mm3, %%mm4\n\t"
  791. "movq %%mm3, %%mm5\n\t"
  792. "psrlq $3, %%mm0\n\t"
  793. "psrlq $3, %%mm3\n\t"
  794. "pand %2, %%mm0\n\t"
  795. "pand %2, %%mm3\n\t"
  796. "psrlq $6, %%mm1\n\t"
  797. "psrlq $6, %%mm4\n\t"
  798. "pand %%mm6, %%mm1\n\t"
  799. "pand %%mm6, %%mm4\n\t"
  800. "psrlq $9, %%mm2\n\t"
  801. "psrlq $9, %%mm5\n\t"
  802. "pand %%mm7, %%mm2\n\t"
  803. "pand %%mm7, %%mm5\n\t"
  804. "por %%mm1, %%mm0\n\t"
  805. "por %%mm4, %%mm3\n\t"
  806. "por %%mm2, %%mm0\n\t"
  807. "por %%mm5, %%mm3\n\t"
  808. "psllq $16, %%mm3\n\t"
  809. "por %%mm3, %%mm0\n\t"
  810. MOVNTQ" %%mm0, %0\n\t"
  811. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  812. d += 4;
  813. s += 12;
  814. }
  815. __asm __volatile(SFENCE:::"memory");
  816. __asm __volatile(EMMS:::"memory");
  817. #endif
  818. while(s < end)
  819. {
  820. const int b= *s++;
  821. const int g= *s++;
  822. const int r= *s++;
  823. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  824. }
  825. }
  826. static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
  827. {
  828. const uint8_t *s = src;
  829. const uint8_t *end;
  830. #ifdef HAVE_MMX
  831. const uint8_t *mm_end;
  832. #endif
  833. uint16_t *d = (uint16_t *)dst;
  834. end = s + src_size;
  835. #ifdef HAVE_MMX
  836. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  837. __asm __volatile(
  838. "movq %0, %%mm7\n\t"
  839. "movq %1, %%mm6\n\t"
  840. ::"m"(red_15mask),"m"(green_15mask));
  841. mm_end = end - 15;
  842. while(s < mm_end)
  843. {
  844. __asm __volatile(
  845. PREFETCH" 32%1\n\t"
  846. "movd %1, %%mm0\n\t"
  847. "movd 3%1, %%mm3\n\t"
  848. "punpckldq 6%1, %%mm0\n\t"
  849. "punpckldq 9%1, %%mm3\n\t"
  850. "movq %%mm0, %%mm1\n\t"
  851. "movq %%mm0, %%mm2\n\t"
  852. "movq %%mm3, %%mm4\n\t"
  853. "movq %%mm3, %%mm5\n\t"
  854. "psllq $7, %%mm0\n\t"
  855. "psllq $7, %%mm3\n\t"
  856. "pand %%mm7, %%mm0\n\t"
  857. "pand %%mm7, %%mm3\n\t"
  858. "psrlq $6, %%mm1\n\t"
  859. "psrlq $6, %%mm4\n\t"
  860. "pand %%mm6, %%mm1\n\t"
  861. "pand %%mm6, %%mm4\n\t"
  862. "psrlq $19, %%mm2\n\t"
  863. "psrlq $19, %%mm5\n\t"
  864. "pand %2, %%mm2\n\t"
  865. "pand %2, %%mm5\n\t"
  866. "por %%mm1, %%mm0\n\t"
  867. "por %%mm4, %%mm3\n\t"
  868. "por %%mm2, %%mm0\n\t"
  869. "por %%mm5, %%mm3\n\t"
  870. "psllq $16, %%mm3\n\t"
  871. "por %%mm3, %%mm0\n\t"
  872. MOVNTQ" %%mm0, %0\n\t"
  873. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  874. d += 4;
  875. s += 12;
  876. }
  877. __asm __volatile(SFENCE:::"memory");
  878. __asm __volatile(EMMS:::"memory");
  879. #endif
  880. while(s < end)
  881. {
  882. const int r= *s++;
  883. const int g= *s++;
  884. const int b= *s++;
  885. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  886. }
  887. }
  /*
    A less accurate approximation is used here: the input value is simply
    shifted left and the low-order bits are filled with zeroes. This method
    improves PNG compression, but it cannot reproduce white exactly, since it
    does not generate an all-ones maximum value; the net effect is to darken
    the image slightly.

    The better method would be "left bit replication":

     4 3 2 1 0
     ---------
     1 1 0 1 1

     7 6 5 4 3  2 1 0
     ----------------
     1 1 0 1 1  1 1 0
     |=======|  |===|
         |      Leftmost Bits Repeated to Fill Open Bits
         |
     Original Bits
  */
  908. static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
  909. {
  910. const uint16_t *end;
  911. #ifdef HAVE_MMX
  912. const uint16_t *mm_end;
  913. #endif
  914. uint8_t *d = (uint8_t *)dst;
  915. const uint16_t *s = (uint16_t *)src;
  916. end = s + src_size/2;
  917. #ifdef HAVE_MMX
  918. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  919. mm_end = end - 7;
  920. while(s < mm_end)
  921. {
  922. __asm __volatile(
  923. PREFETCH" 32%1\n\t"
  924. "movq %1, %%mm0\n\t"
  925. "movq %1, %%mm1\n\t"
  926. "movq %1, %%mm2\n\t"
  927. "pand %2, %%mm0\n\t"
  928. "pand %3, %%mm1\n\t"
  929. "pand %4, %%mm2\n\t"
  930. "psllq $3, %%mm0\n\t"
  931. "psrlq $2, %%mm1\n\t"
  932. "psrlq $7, %%mm2\n\t"
  933. "movq %%mm0, %%mm3\n\t"
  934. "movq %%mm1, %%mm4\n\t"
  935. "movq %%mm2, %%mm5\n\t"
  936. "punpcklwd %5, %%mm0\n\t"
  937. "punpcklwd %5, %%mm1\n\t"
  938. "punpcklwd %5, %%mm2\n\t"
  939. "punpckhwd %5, %%mm3\n\t"
  940. "punpckhwd %5, %%mm4\n\t"
  941. "punpckhwd %5, %%mm5\n\t"
  942. "psllq $8, %%mm1\n\t"
  943. "psllq $16, %%mm2\n\t"
  944. "por %%mm1, %%mm0\n\t"
  945. "por %%mm2, %%mm0\n\t"
  946. "psllq $8, %%mm4\n\t"
  947. "psllq $16, %%mm5\n\t"
  948. "por %%mm4, %%mm3\n\t"
  949. "por %%mm5, %%mm3\n\t"
  950. "movq %%mm0, %%mm6\n\t"
  951. "movq %%mm3, %%mm7\n\t"
  952. "movq 8%1, %%mm0\n\t"
  953. "movq 8%1, %%mm1\n\t"
  954. "movq 8%1, %%mm2\n\t"
  955. "pand %2, %%mm0\n\t"
  956. "pand %3, %%mm1\n\t"
  957. "pand %4, %%mm2\n\t"
  958. "psllq $3, %%mm0\n\t"
  959. "psrlq $2, %%mm1\n\t"
  960. "psrlq $7, %%mm2\n\t"
  961. "movq %%mm0, %%mm3\n\t"
  962. "movq %%mm1, %%mm4\n\t"
  963. "movq %%mm2, %%mm5\n\t"
  964. "punpcklwd %5, %%mm0\n\t"
  965. "punpcklwd %5, %%mm1\n\t"
  966. "punpcklwd %5, %%mm2\n\t"
  967. "punpckhwd %5, %%mm3\n\t"
  968. "punpckhwd %5, %%mm4\n\t"
  969. "punpckhwd %5, %%mm5\n\t"
  970. "psllq $8, %%mm1\n\t"
  971. "psllq $16, %%mm2\n\t"
  972. "por %%mm1, %%mm0\n\t"
  973. "por %%mm2, %%mm0\n\t"
  974. "psllq $8, %%mm4\n\t"
  975. "psllq $16, %%mm5\n\t"
  976. "por %%mm4, %%mm3\n\t"
  977. "por %%mm5, %%mm3\n\t"
  978. :"=m"(*d)
  979. :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
  980. :"memory");
  981. /* Borrowed 32 to 24 */
  982. __asm __volatile(
  983. "movq %%mm0, %%mm4\n\t"
  984. "movq %%mm3, %%mm5\n\t"
  985. "movq %%mm6, %%mm0\n\t"
  986. "movq %%mm7, %%mm1\n\t"
  987. "movq %%mm4, %%mm6\n\t"
  988. "movq %%mm5, %%mm7\n\t"
  989. "movq %%mm0, %%mm2\n\t"
  990. "movq %%mm1, %%mm3\n\t"
  991. "psrlq $8, %%mm2\n\t"
  992. "psrlq $8, %%mm3\n\t"
  993. "psrlq $8, %%mm6\n\t"
  994. "psrlq $8, %%mm7\n\t"
  995. "pand %2, %%mm0\n\t"
  996. "pand %2, %%mm1\n\t"
  997. "pand %2, %%mm4\n\t"
  998. "pand %2, %%mm5\n\t"
  999. "pand %3, %%mm2\n\t"
  1000. "pand %3, %%mm3\n\t"
  1001. "pand %3, %%mm6\n\t"
  1002. "pand %3, %%mm7\n\t"
  1003. "por %%mm2, %%mm0\n\t"
  1004. "por %%mm3, %%mm1\n\t"
  1005. "por %%mm6, %%mm4\n\t"
  1006. "por %%mm7, %%mm5\n\t"
  1007. "movq %%mm1, %%mm2\n\t"
  1008. "movq %%mm4, %%mm3\n\t"
  1009. "psllq $48, %%mm2\n\t"
  1010. "psllq $32, %%mm3\n\t"
  1011. "pand %4, %%mm2\n\t"
  1012. "pand %5, %%mm3\n\t"
  1013. "por %%mm2, %%mm0\n\t"
  1014. "psrlq $16, %%mm1\n\t"
  1015. "psrlq $32, %%mm4\n\t"
  1016. "psllq $16, %%mm5\n\t"
  1017. "por %%mm3, %%mm1\n\t"
  1018. "pand %6, %%mm5\n\t"
  1019. "por %%mm5, %%mm4\n\t"
  1020. MOVNTQ" %%mm0, %0\n\t"
  1021. MOVNTQ" %%mm1, 8%0\n\t"
  1022. MOVNTQ" %%mm4, 16%0"
  1023. :"=m"(*d)
  1024. :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
  1025. :"memory");
  1026. d += 24;
  1027. s += 8;
  1028. }
  1029. __asm __volatile(SFENCE:::"memory");
  1030. __asm __volatile(EMMS:::"memory");
  1031. #endif
  1032. while(s < end)
  1033. {
  1034. register uint16_t bgr;
  1035. bgr = *s++;
  1036. *d++ = (bgr&0x1F)<<3;
  1037. *d++ = (bgr&0x3E0)>>2;
  1038. *d++ = (bgr&0x7C00)>>7;
  1039. }
  1040. }
  1041. static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
  1042. {
  1043. const uint16_t *end;
  1044. #ifdef HAVE_MMX
  1045. const uint16_t *mm_end;
  1046. #endif
  1047. uint8_t *d = (uint8_t *)dst;
  1048. const uint16_t *s = (const uint16_t *)src;
  1049. end = s + src_size/2;
  1050. #ifdef HAVE_MMX
  1051. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  1052. mm_end = end - 7;
  1053. while(s < mm_end)
  1054. {
  1055. __asm __volatile(
  1056. PREFETCH" 32%1\n\t"
  1057. "movq %1, %%mm0\n\t"
  1058. "movq %1, %%mm1\n\t"
  1059. "movq %1, %%mm2\n\t"
  1060. "pand %2, %%mm0\n\t"
  1061. "pand %3, %%mm1\n\t"
  1062. "pand %4, %%mm2\n\t"
  1063. "psllq $3, %%mm0\n\t"
  1064. "psrlq $3, %%mm1\n\t"
  1065. "psrlq $8, %%mm2\n\t"
  1066. "movq %%mm0, %%mm3\n\t"
  1067. "movq %%mm1, %%mm4\n\t"
  1068. "movq %%mm2, %%mm5\n\t"
  1069. "punpcklwd %5, %%mm0\n\t"
  1070. "punpcklwd %5, %%mm1\n\t"
  1071. "punpcklwd %5, %%mm2\n\t"
  1072. "punpckhwd %5, %%mm3\n\t"
  1073. "punpckhwd %5, %%mm4\n\t"
  1074. "punpckhwd %5, %%mm5\n\t"
  1075. "psllq $8, %%mm1\n\t"
  1076. "psllq $16, %%mm2\n\t"
  1077. "por %%mm1, %%mm0\n\t"
  1078. "por %%mm2, %%mm0\n\t"
  1079. "psllq $8, %%mm4\n\t"
  1080. "psllq $16, %%mm5\n\t"
  1081. "por %%mm4, %%mm3\n\t"
  1082. "por %%mm5, %%mm3\n\t"
  1083. "movq %%mm0, %%mm6\n\t"
  1084. "movq %%mm3, %%mm7\n\t"
  1085. "movq 8%1, %%mm0\n\t"
  1086. "movq 8%1, %%mm1\n\t"
  1087. "movq 8%1, %%mm2\n\t"
  1088. "pand %2, %%mm0\n\t"
  1089. "pand %3, %%mm1\n\t"
  1090. "pand %4, %%mm2\n\t"
  1091. "psllq $3, %%mm0\n\t"
  1092. "psrlq $3, %%mm1\n\t"
  1093. "psrlq $8, %%mm2\n\t"
  1094. "movq %%mm0, %%mm3\n\t"
  1095. "movq %%mm1, %%mm4\n\t"
  1096. "movq %%mm2, %%mm5\n\t"
  1097. "punpcklwd %5, %%mm0\n\t"
  1098. "punpcklwd %5, %%mm1\n\t"
  1099. "punpcklwd %5, %%mm2\n\t"
  1100. "punpckhwd %5, %%mm3\n\t"
  1101. "punpckhwd %5, %%mm4\n\t"
  1102. "punpckhwd %5, %%mm5\n\t"
  1103. "psllq $8, %%mm1\n\t"
  1104. "psllq $16, %%mm2\n\t"
  1105. "por %%mm1, %%mm0\n\t"
  1106. "por %%mm2, %%mm0\n\t"
  1107. "psllq $8, %%mm4\n\t"
  1108. "psllq $16, %%mm5\n\t"
  1109. "por %%mm4, %%mm3\n\t"
  1110. "por %%mm5, %%mm3\n\t"
  1111. :"=m"(*d)
  1112. :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
  1113. :"memory");
  1114. /* Borrowed 32 to 24 */
  1115. __asm __volatile(
  1116. "movq %%mm0, %%mm4\n\t"
  1117. "movq %%mm3, %%mm5\n\t"
  1118. "movq %%mm6, %%mm0\n\t"
  1119. "movq %%mm7, %%mm1\n\t"
  1120. "movq %%mm4, %%mm6\n\t"
  1121. "movq %%mm5, %%mm7\n\t"
  1122. "movq %%mm0, %%mm2\n\t"
  1123. "movq %%mm1, %%mm3\n\t"
  1124. "psrlq $8, %%mm2\n\t"
  1125. "psrlq $8, %%mm3\n\t"
  1126. "psrlq $8, %%mm6\n\t"
  1127. "psrlq $8, %%mm7\n\t"
  1128. "pand %2, %%mm0\n\t"
  1129. "pand %2, %%mm1\n\t"
  1130. "pand %2, %%mm4\n\t"
  1131. "pand %2, %%mm5\n\t"
  1132. "pand %3, %%mm2\n\t"
  1133. "pand %3, %%mm3\n\t"
  1134. "pand %3, %%mm6\n\t"
  1135. "pand %3, %%mm7\n\t"
  1136. "por %%mm2, %%mm0\n\t"
  1137. "por %%mm3, %%mm1\n\t"
  1138. "por %%mm6, %%mm4\n\t"
  1139. "por %%mm7, %%mm5\n\t"
  1140. "movq %%mm1, %%mm2\n\t"
  1141. "movq %%mm4, %%mm3\n\t"
  1142. "psllq $48, %%mm2\n\t"
  1143. "psllq $32, %%mm3\n\t"
  1144. "pand %4, %%mm2\n\t"
  1145. "pand %5, %%mm3\n\t"
  1146. "por %%mm2, %%mm0\n\t"
  1147. "psrlq $16, %%mm1\n\t"
  1148. "psrlq $32, %%mm4\n\t"
  1149. "psllq $16, %%mm5\n\t"
  1150. "por %%mm3, %%mm1\n\t"
  1151. "pand %6, %%mm5\n\t"
  1152. "por %%mm5, %%mm4\n\t"
  1153. MOVNTQ" %%mm0, %0\n\t"
  1154. MOVNTQ" %%mm1, 8%0\n\t"
  1155. MOVNTQ" %%mm4, 16%0"
  1156. :"=m"(*d)
  1157. :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
  1158. :"memory");
  1159. d += 24;
  1160. s += 8;
  1161. }
  1162. __asm __volatile(SFENCE:::"memory");
  1163. __asm __volatile(EMMS:::"memory");
  1164. #endif
  1165. while(s < end)
  1166. {
  1167. register uint16_t bgr;
  1168. bgr = *s++;
  1169. *d++ = (bgr&0x1F)<<3;
  1170. *d++ = (bgr&0x7E0)>>3;
  1171. *d++ = (bgr&0xF800)>>8;
  1172. }
  1173. }
  1174. static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
  1175. {
  1176. const uint16_t *end;
  1177. #ifdef HAVE_MMX
  1178. const uint16_t *mm_end;
  1179. #endif
  1180. uint8_t *d = (uint8_t *)dst;
  1181. const uint16_t *s = (const uint16_t *)src;
  1182. end = s + src_size/2;
  1183. #ifdef HAVE_MMX
  1184. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  1185. __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
  1186. mm_end = end - 3;
  1187. while(s < mm_end)
  1188. {
  1189. __asm __volatile(
  1190. PREFETCH" 32%1\n\t"
  1191. "movq %1, %%mm0\n\t"
  1192. "movq %1, %%mm1\n\t"
  1193. "movq %1, %%mm2\n\t"
  1194. "pand %2, %%mm0\n\t"
  1195. "pand %3, %%mm1\n\t"
  1196. "pand %4, %%mm2\n\t"
  1197. "psllq $3, %%mm0\n\t"
  1198. "psrlq $2, %%mm1\n\t"
  1199. "psrlq $7, %%mm2\n\t"
  1200. "movq %%mm0, %%mm3\n\t"
  1201. "movq %%mm1, %%mm4\n\t"
  1202. "movq %%mm2, %%mm5\n\t"
  1203. "punpcklwd %%mm7, %%mm0\n\t"
  1204. "punpcklwd %%mm7, %%mm1\n\t"
  1205. "punpcklwd %%mm7, %%mm2\n\t"
  1206. "punpckhwd %%mm7, %%mm3\n\t"
  1207. "punpckhwd %%mm7, %%mm4\n\t"
  1208. "punpckhwd %%mm7, %%mm5\n\t"
  1209. "psllq $8, %%mm1\n\t"
  1210. "psllq $16, %%mm2\n\t"
  1211. "por %%mm1, %%mm0\n\t"
  1212. "por %%mm2, %%mm0\n\t"
  1213. "psllq $8, %%mm4\n\t"
  1214. "psllq $16, %%mm5\n\t"
  1215. "por %%mm4, %%mm3\n\t"
  1216. "por %%mm5, %%mm3\n\t"
  1217. MOVNTQ" %%mm0, %0\n\t"
  1218. MOVNTQ" %%mm3, 8%0\n\t"
  1219. :"=m"(*d)
  1220. :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
  1221. :"memory");
  1222. d += 16;
  1223. s += 4;
  1224. }
  1225. __asm __volatile(SFENCE:::"memory");
  1226. __asm __volatile(EMMS:::"memory");
  1227. #endif
  1228. while(s < end)
  1229. {
  1230. #if 0 //slightly slower on athlon
  1231. int bgr= *s++;
  1232. *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
  1233. #else
  1234. register uint16_t bgr;
  1235. bgr = *s++;
  1236. #ifdef WORDS_BIGENDIAN
  1237. *d++ = 0;
  1238. *d++ = (bgr&0x7C00)>>7;
  1239. *d++ = (bgr&0x3E0)>>2;
  1240. *d++ = (bgr&0x1F)<<3;
  1241. #else
  1242. *d++ = (bgr&0x1F)<<3;
  1243. *d++ = (bgr&0x3E0)>>2;
  1244. *d++ = (bgr&0x7C00)>>7;
  1245. *d++ = 0;
  1246. #endif
  1247. #endif
  1248. }
  1249. }
  1250. static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
  1251. {
  1252. const uint16_t *end;
  1253. #ifdef HAVE_MMX
  1254. const uint16_t *mm_end;
  1255. #endif
  1256. uint8_t *d = (uint8_t *)dst;
  1257. const uint16_t *s = (uint16_t *)src;
  1258. end = s + src_size/2;
  1259. #ifdef HAVE_MMX
  1260. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  1261. __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
  1262. mm_end = end - 3;
  1263. while(s < mm_end)
  1264. {
  1265. __asm __volatile(
  1266. PREFETCH" 32%1\n\t"
  1267. "movq %1, %%mm0\n\t"
  1268. "movq %1, %%mm1\n\t"
  1269. "movq %1, %%mm2\n\t"
  1270. "pand %2, %%mm0\n\t"
  1271. "pand %3, %%mm1\n\t"
  1272. "pand %4, %%mm2\n\t"
  1273. "psllq $3, %%mm0\n\t"
  1274. "psrlq $3, %%mm1\n\t"
  1275. "psrlq $8, %%mm2\n\t"
  1276. "movq %%mm0, %%mm3\n\t"
  1277. "movq %%mm1, %%mm4\n\t"
  1278. "movq %%mm2, %%mm5\n\t"
  1279. "punpcklwd %%mm7, %%mm0\n\t"
  1280. "punpcklwd %%mm7, %%mm1\n\t"
  1281. "punpcklwd %%mm7, %%mm2\n\t"
  1282. "punpckhwd %%mm7, %%mm3\n\t"
  1283. "punpckhwd %%mm7, %%mm4\n\t"
  1284. "punpckhwd %%mm7, %%mm5\n\t"
  1285. "psllq $8, %%mm1\n\t"
  1286. "psllq $16, %%mm2\n\t"
  1287. "por %%mm1, %%mm0\n\t"
  1288. "por %%mm2, %%mm0\n\t"
  1289. "psllq $8, %%mm4\n\t"
  1290. "psllq $16, %%mm5\n\t"
  1291. "por %%mm4, %%mm3\n\t"
  1292. "por %%mm5, %%mm3\n\t"
  1293. MOVNTQ" %%mm0, %0\n\t"
  1294. MOVNTQ" %%mm3, 8%0\n\t"
  1295. :"=m"(*d)
  1296. :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
  1297. :"memory");
  1298. d += 16;
  1299. s += 4;
  1300. }
  1301. __asm __volatile(SFENCE:::"memory");
  1302. __asm __volatile(EMMS:::"memory");
  1303. #endif
  1304. while(s < end)
  1305. {
  1306. register uint16_t bgr;
  1307. bgr = *s++;
  1308. #ifdef WORDS_BIGENDIAN
  1309. *d++ = 0;
  1310. *d++ = (bgr&0xF800)>>8;
  1311. *d++ = (bgr&0x7E0)>>3;
  1312. *d++ = (bgr&0x1F)<<3;
  1313. #else
  1314. *d++ = (bgr&0x1F)<<3;
  1315. *d++ = (bgr&0x7E0)>>3;
  1316. *d++ = (bgr&0xF800)>>8;
  1317. *d++ = 0;
  1318. #endif
  1319. }
  1320. }
  1321. static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
  1322. {
  1323. #ifdef HAVE_MMX
  1324. /* TODO: unroll this loop */
  1325. asm volatile (
  1326. "xor %%"REG_a", %%"REG_a" \n\t"
  1327. ASMALIGN(4)
  1328. "1: \n\t"
  1329. PREFETCH" 32(%0, %%"REG_a") \n\t"
  1330. "movq (%0, %%"REG_a"), %%mm0 \n\t"
  1331. "movq %%mm0, %%mm1 \n\t"
  1332. "movq %%mm0, %%mm2 \n\t"
  1333. "pslld $16, %%mm0 \n\t"
  1334. "psrld $16, %%mm1 \n\t"
  1335. "pand "MANGLE(mask32r)", %%mm0 \n\t"
  1336. "pand "MANGLE(mask32g)", %%mm2 \n\t"
  1337. "pand "MANGLE(mask32b)", %%mm1 \n\t"
  1338. "por %%mm0, %%mm2 \n\t"
  1339. "por %%mm1, %%mm2 \n\t"
  1340. MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
  1341. "add $8, %%"REG_a" \n\t"
  1342. "cmp %2, %%"REG_a" \n\t"
  1343. " jb 1b \n\t"
  1344. :: "r" (src), "r"(dst), "r" (src_size-7)
  1345. : "%"REG_a
  1346. );
  1347. __asm __volatile(SFENCE:::"memory");
  1348. __asm __volatile(EMMS:::"memory");
  1349. #else
  1350. unsigned i;
  1351. unsigned num_pixels = src_size >> 2;
  1352. for(i=0; i<num_pixels; i++)
  1353. {
  1354. #ifdef WORDS_BIGENDIAN
  1355. dst[4*i + 1] = src[4*i + 3];
  1356. dst[4*i + 2] = src[4*i + 2];
  1357. dst[4*i + 3] = src[4*i + 1];
  1358. #else
  1359. dst[4*i + 0] = src[4*i + 2];
  1360. dst[4*i + 1] = src[4*i + 1];
  1361. dst[4*i + 2] = src[4*i + 0];
  1362. #endif
  1363. }
  1364. #endif
  1365. }
  1366. static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
  1367. {
  1368. unsigned i;
  1369. #ifdef HAVE_MMX
  1370. long mmx_size= 23 - src_size;
  1371. asm volatile (
  1372. "movq "MANGLE(mask24r)", %%mm5 \n\t"
  1373. "movq "MANGLE(mask24g)", %%mm6 \n\t"
  1374. "movq "MANGLE(mask24b)", %%mm7 \n\t"
  1375. ASMALIGN(4)
  1376. "1: \n\t"
  1377. PREFETCH" 32(%1, %%"REG_a") \n\t"
  1378. "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
  1379. "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
  1380. "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
  1381. "psllq $16, %%mm0 \n\t" // 00 BGR BGR
  1382. "pand %%mm5, %%mm0 \n\t"
  1383. "pand %%mm6, %%mm1 \n\t"
  1384. "pand %%mm7, %%mm2 \n\t"
  1385. "por %%mm0, %%mm1 \n\t"
  1386. "por %%mm2, %%mm1 \n\t"
  1387. "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
  1388. MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
  1389. "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
  1390. "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
  1391. "pand %%mm7, %%mm0 \n\t"
  1392. "pand %%mm5, %%mm1 \n\t"
  1393. "pand %%mm6, %%mm2 \n\t"
  1394. "por %%mm0, %%mm1 \n\t"
  1395. "por %%mm2, %%mm1 \n\t"
  1396. "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
  1397. MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
  1398. "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
  1399. "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
  1400. "pand %%mm6, %%mm0 \n\t"
  1401. "pand %%mm7, %%mm1 \n\t"
  1402. "pand %%mm5, %%mm2 \n\t"
  1403. "por %%mm0, %%mm1 \n\t"
  1404. "por %%mm2, %%mm1 \n\t"
  1405. MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
  1406. "add $24, %%"REG_a" \n\t"
  1407. " js 1b \n\t"
  1408. : "+a" (mmx_size)
  1409. : "r" (src-mmx_size), "r"(dst-mmx_size)
  1410. );
  1411. __asm __volatile(SFENCE:::"memory");
  1412. __asm __volatile(EMMS:::"memory");
  1413. if(mmx_size==23) return; //finihsed, was multiple of 8
  1414. src+= src_size;
  1415. dst+= src_size;
  1416. src_size= 23-mmx_size;
  1417. src-= src_size;
  1418. dst-= src_size;
  1419. #endif
  1420. for(i=0; i<src_size; i+=3)
  1421. {
  1422. register uint8_t x;
  1423. x = src[i + 2];
  1424. dst[i + 1] = src[i + 1];
  1425. dst[i + 2] = src[i + 0];
  1426. dst[i + 0] = x;
  1427. }
  1428. }
  1429. static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1430. long width, long height,
  1431. long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
  1432. {
  1433. long y;
  1434. const long chromWidth= width>>1;
  1435. for(y=0; y<height; y++)
  1436. {
  1437. #ifdef HAVE_MMX
  1438. //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
  1439. asm volatile(
  1440. "xor %%"REG_a", %%"REG_a" \n\t"
  1441. ASMALIGN(4)
  1442. "1: \n\t"
  1443. PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
  1444. PREFETCH" 32(%2, %%"REG_a") \n\t"
  1445. PREFETCH" 32(%3, %%"REG_a") \n\t"
  1446. "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
  1447. "movq %%mm0, %%mm2 \n\t" // U(0)
  1448. "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
  1449. "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1450. "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
  1451. "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
  1452. "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
  1453. "movq %%mm3, %%mm4 \n\t" // Y(0)
  1454. "movq %%mm5, %%mm6 \n\t" // Y(8)
  1455. "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
  1456. "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
  1457. "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
  1458. "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
  1459. MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
  1460. MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
  1461. MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
  1462. MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
  1463. "add $8, %%"REG_a" \n\t"
  1464. "cmp %4, %%"REG_a" \n\t"
  1465. " jb 1b \n\t"
  1466. ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
  1467. : "%"REG_a
  1468. );
  1469. #else
  1470. #if defined ARCH_ALPHA && defined HAVE_MVI
  1471. #define pl2yuy2(n) \
  1472. y1 = yc[n]; \
  1473. y2 = yc2[n]; \
  1474. u = uc[n]; \
  1475. v = vc[n]; \
  1476. asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
  1477. asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
  1478. asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
  1479. asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
  1480. yuv1 = (u << 8) + (v << 24); \
  1481. yuv2 = yuv1 + y2; \
  1482. yuv1 += y1; \
  1483. qdst[n] = yuv1; \
  1484. qdst2[n] = yuv2;
  1485. int i;
  1486. uint64_t *qdst = (uint64_t *) dst;
  1487. uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
  1488. const uint32_t *yc = (uint32_t *) ysrc;
  1489. const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
  1490. const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
  1491. for(i = 0; i < chromWidth; i += 8){
  1492. uint64_t y1, y2, yuv1, yuv2;
  1493. uint64_t u, v;
  1494. /* Prefetch */
  1495. asm("ldq $31,64(%0)" :: "r"(yc));
  1496. asm("ldq $31,64(%0)" :: "r"(yc2));
  1497. asm("ldq $31,64(%0)" :: "r"(uc));
  1498. asm("ldq $31,64(%0)" :: "r"(vc));
  1499. pl2yuy2(0);
  1500. pl2yuy2(1);
  1501. pl2yuy2(2);
  1502. pl2yuy2(3);
  1503. yc += 4;
  1504. yc2 += 4;
  1505. uc += 4;
  1506. vc += 4;
  1507. qdst += 4;
  1508. qdst2 += 4;
  1509. }
  1510. y++;
  1511. ysrc += lumStride;
  1512. dst += dstStride;
  1513. #elif __WORDSIZE >= 64
  1514. int i;
  1515. uint64_t *ldst = (uint64_t *) dst;
  1516. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1517. for(i = 0; i < chromWidth; i += 2){
  1518. uint64_t k, l;
  1519. k = yc[0] + (uc[0] << 8) +
  1520. (yc[1] << 16) + (vc[0] << 24);
  1521. l = yc[2] + (uc[1] << 8) +
  1522. (yc[3] << 16) + (vc[1] << 24);
  1523. *ldst++ = k + (l << 32);
  1524. yc += 4;
  1525. uc += 2;
  1526. vc += 2;
  1527. }
  1528. #else
  1529. int i, *idst = (int32_t *) dst;
  1530. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1531. for(i = 0; i < chromWidth; i++){
  1532. #ifdef WORDS_BIGENDIAN
  1533. *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
  1534. (yc[1] << 8) + (vc[0] << 0);
  1535. #else
  1536. *idst++ = yc[0] + (uc[0] << 8) +
  1537. (yc[1] << 16) + (vc[0] << 24);
  1538. #endif
  1539. yc += 2;
  1540. uc++;
  1541. vc++;
  1542. }
  1543. #endif
  1544. #endif
  1545. if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
  1546. {
  1547. usrc += chromStride;
  1548. vsrc += chromStride;
  1549. }
  1550. ysrc += lumStride;
  1551. dst += dstStride;
  1552. }
  1553. #ifdef HAVE_MMX
  1554. asm( EMMS" \n\t"
  1555. SFENCE" \n\t"
  1556. :::"memory");
  1557. #endif
  1558. }
  1559. /**
  1560. *
  1561. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1562. * problem for anyone then tell me, and ill fix it)
  1563. */
  1564. static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1565. long width, long height,
  1566. long lumStride, long chromStride, long dstStride)
  1567. {
  1568. //FIXME interpolate chroma
  1569. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1570. }
  1571. static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1572. long width, long height,
  1573. long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
  1574. {
  1575. long y;
  1576. const long chromWidth= width>>1;
  1577. for(y=0; y<height; y++)
  1578. {
  1579. #ifdef HAVE_MMX
  1580. //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
  1581. asm volatile(
  1582. "xor %%"REG_a", %%"REG_a" \n\t"
  1583. ASMALIGN(4)
  1584. "1: \n\t"
  1585. PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
  1586. PREFETCH" 32(%2, %%"REG_a") \n\t"
  1587. PREFETCH" 32(%3, %%"REG_a") \n\t"
  1588. "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
  1589. "movq %%mm0, %%mm2 \n\t" // U(0)
  1590. "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
  1591. "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1592. "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
  1593. "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
  1594. "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
  1595. "movq %%mm0, %%mm4 \n\t" // Y(0)
  1596. "movq %%mm2, %%mm6 \n\t" // Y(8)
  1597. "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
  1598. "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
  1599. "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
  1600. "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
  1601. MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
  1602. MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
  1603. MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
  1604. MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
  1605. "add $8, %%"REG_a" \n\t"
  1606. "cmp %4, %%"REG_a" \n\t"
  1607. " jb 1b \n\t"
  1608. ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
  1609. : "%"REG_a
  1610. );
  1611. #else
  1612. //FIXME adapt the alpha asm code from yv12->yuy2
  1613. #if __WORDSIZE >= 64
  1614. int i;
  1615. uint64_t *ldst = (uint64_t *) dst;
  1616. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1617. for(i = 0; i < chromWidth; i += 2){
  1618. uint64_t k, l;
  1619. k = uc[0] + (yc[0] << 8) +
  1620. (vc[0] << 16) + (yc[1] << 24);
  1621. l = uc[1] + (yc[2] << 8) +
  1622. (vc[1] << 16) + (yc[3] << 24);
  1623. *ldst++ = k + (l << 32);
  1624. yc += 4;
  1625. uc += 2;
  1626. vc += 2;
  1627. }
  1628. #else
  1629. int i, *idst = (int32_t *) dst;
  1630. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1631. for(i = 0; i < chromWidth; i++){
  1632. #ifdef WORDS_BIGENDIAN
  1633. *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
  1634. (vc[0] << 8) + (yc[1] << 0);
  1635. #else
  1636. *idst++ = uc[0] + (yc[0] << 8) +
  1637. (vc[0] << 16) + (yc[1] << 24);
  1638. #endif
  1639. yc += 2;
  1640. uc++;
  1641. vc++;
  1642. }
  1643. #endif
  1644. #endif
  1645. if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
  1646. {
  1647. usrc += chromStride;
  1648. vsrc += chromStride;
  1649. }
  1650. ysrc += lumStride;
  1651. dst += dstStride;
  1652. }
  1653. #ifdef HAVE_MMX
  1654. asm( EMMS" \n\t"
  1655. SFENCE" \n\t"
  1656. :::"memory");
  1657. #endif
  1658. }
  1659. /**
  1660. *
  1661. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1662. * problem for anyone then tell me, and ill fix it)
  1663. */
  1664. static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1665. long width, long height,
  1666. long lumStride, long chromStride, long dstStride)
  1667. {
  1668. //FIXME interpolate chroma
  1669. RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1670. }
  1671. /**
  1672. *
  1673. * width should be a multiple of 16
  1674. */
  1675. static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1676. long width, long height,
  1677. long lumStride, long chromStride, long dstStride)
  1678. {
  1679. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1680. }
  1681. /**
  1682. *
  1683. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1684. * problem for anyone then tell me, and ill fix it)
  1685. */
  1686. static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1687. long width, long height,
  1688. long lumStride, long chromStride, long srcStride)
  1689. {
  1690. long y;
  1691. const long chromWidth= width>>1;
  1692. for(y=0; y<height; y+=2)
  1693. {
  1694. #ifdef HAVE_MMX
  1695. asm volatile(
  1696. "xor %%"REG_a", %%"REG_a" \n\t"
  1697. "pcmpeqw %%mm7, %%mm7 \n\t"
  1698. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1699. ASMALIGN(4)
  1700. "1: \n\t"
  1701. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1702. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1703. "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
  1704. "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
  1705. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
  1706. "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
  1707. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
  1708. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1709. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1710. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1711. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1712. MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
  1713. "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
  1714. "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
  1715. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
  1716. "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
  1717. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
  1718. "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
  1719. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1720. "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1721. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1722. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1723. MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
  1724. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1725. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1726. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1727. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1728. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1729. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1730. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1731. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1732. MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
  1733. MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
  1734. "add $8, %%"REG_a" \n\t"
  1735. "cmp %4, %%"REG_a" \n\t"
  1736. " jb 1b \n\t"
  1737. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1738. : "memory", "%"REG_a
  1739. );
  1740. ydst += lumStride;
  1741. src += srcStride;
  1742. asm volatile(
  1743. "xor %%"REG_a", %%"REG_a" \n\t"
  1744. ASMALIGN(4)
  1745. "1: \n\t"
  1746. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1747. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1748. "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
  1749. "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
  1750. "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
  1751. "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1752. "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1753. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1754. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1755. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1756. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1757. MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
  1758. MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
  1759. "add $8, %%"REG_a" \n\t"
  1760. "cmp %4, %%"REG_a" \n\t"
  1761. " jb 1b \n\t"
  1762. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1763. : "memory", "%"REG_a
  1764. );
  1765. #else
  1766. long i;
  1767. for(i=0; i<chromWidth; i++)
  1768. {
  1769. ydst[2*i+0] = src[4*i+0];
  1770. udst[i] = src[4*i+1];
  1771. ydst[2*i+1] = src[4*i+2];
  1772. vdst[i] = src[4*i+3];
  1773. }
  1774. ydst += lumStride;
  1775. src += srcStride;
  1776. for(i=0; i<chromWidth; i++)
  1777. {
  1778. ydst[2*i+0] = src[4*i+0];
  1779. ydst[2*i+1] = src[4*i+2];
  1780. }
  1781. #endif
  1782. udst += chromStride;
  1783. vdst += chromStride;
  1784. ydst += lumStride;
  1785. src += srcStride;
  1786. }
  1787. #ifdef HAVE_MMX
  1788. asm volatile( EMMS" \n\t"
  1789. SFENCE" \n\t"
  1790. :::"memory");
  1791. #endif
  1792. }
  1793. static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
  1794. uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1795. long width, long height, long lumStride, long chromStride)
  1796. {
  1797. /* Y Plane */
  1798. memcpy(ydst, ysrc, width*height);
  1799. /* XXX: implement upscaling for U,V */
  1800. }
  1801. static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
  1802. {
  1803. long x,y;
  1804. dst[0]= src[0];
  1805. // first line
  1806. for(x=0; x<srcWidth-1; x++){
  1807. dst[2*x+1]= (3*src[x] + src[x+1])>>2;
  1808. dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
  1809. }
  1810. dst[2*srcWidth-1]= src[srcWidth-1];
  1811. dst+= dstStride;
  1812. for(y=1; y<srcHeight; y++){
  1813. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  1814. const long mmxSize= srcWidth&~15;
  1815. asm volatile(
  1816. "mov %4, %%"REG_a" \n\t"
  1817. "1: \n\t"
  1818. "movq (%0, %%"REG_a"), %%mm0 \n\t"
  1819. "movq (%1, %%"REG_a"), %%mm1 \n\t"
  1820. "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
  1821. "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
  1822. "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
  1823. "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
  1824. PAVGB" %%mm0, %%mm5 \n\t"
  1825. PAVGB" %%mm0, %%mm3 \n\t"
  1826. PAVGB" %%mm0, %%mm5 \n\t"
  1827. PAVGB" %%mm0, %%mm3 \n\t"
  1828. PAVGB" %%mm1, %%mm4 \n\t"
  1829. PAVGB" %%mm1, %%mm2 \n\t"
  1830. PAVGB" %%mm1, %%mm4 \n\t"
  1831. PAVGB" %%mm1, %%mm2 \n\t"
  1832. "movq %%mm5, %%mm7 \n\t"
  1833. "movq %%mm4, %%mm6 \n\t"
  1834. "punpcklbw %%mm3, %%mm5 \n\t"
  1835. "punpckhbw %%mm3, %%mm7 \n\t"
  1836. "punpcklbw %%mm2, %%mm4 \n\t"
  1837. "punpckhbw %%mm2, %%mm6 \n\t"
  1838. #if 1
  1839. MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
  1840. MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
  1841. MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
  1842. MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
  1843. #else
  1844. "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
  1845. "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
  1846. "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
  1847. "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
  1848. #endif
  1849. "add $8, %%"REG_a" \n\t"
  1850. " js 1b \n\t"
  1851. :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
  1852. "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
  1853. "g" (-mmxSize)
  1854. : "%"REG_a
  1855. );
  1856. #else
  1857. const long mmxSize=1;
  1858. #endif
  1859. dst[0 ]= (3*src[0] + src[srcStride])>>2;
  1860. dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
  1861. for(x=mmxSize-1; x<srcWidth-1; x++){
  1862. dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
  1863. dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
  1864. dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
  1865. dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
  1866. }
  1867. dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
  1868. dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
  1869. dst+=dstStride*2;
  1870. src+=srcStride;
  1871. }
  1872. // last line
  1873. #if 1
  1874. dst[0]= src[0];
  1875. for(x=0; x<srcWidth-1; x++){
  1876. dst[2*x+1]= (3*src[x] + src[x+1])>>2;
  1877. dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
  1878. }
  1879. dst[2*srcWidth-1]= src[srcWidth-1];
  1880. #else
  1881. for(x=0; x<srcWidth; x++){
  1882. dst[2*x+0]=
  1883. dst[2*x+1]= src[x];
  1884. }
  1885. #endif
  1886. #ifdef HAVE_MMX
  1887. asm volatile( EMMS" \n\t"
  1888. SFENCE" \n\t"
  1889. :::"memory");
  1890. #endif
  1891. }
  1892. /**
  1893. *
  1894. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1895. * problem for anyone then tell me, and ill fix it)
  1896. * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
  1897. */
  1898. static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1899. long width, long height,
  1900. long lumStride, long chromStride, long srcStride)
  1901. {
  1902. long y;
  1903. const long chromWidth= width>>1;
  1904. for(y=0; y<height; y+=2)
  1905. {
  1906. #ifdef HAVE_MMX
  1907. asm volatile(
  1908. "xorl %%eax, %%eax \n\t"
  1909. "pcmpeqw %%mm7, %%mm7 \n\t"
  1910. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1911. ASMALIGN(4)
  1912. "1: \n\t"
  1913. PREFETCH" 64(%0, %%eax, 4) \n\t"
  1914. "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
  1915. "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
  1916. "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
  1917. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
  1918. "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
  1919. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
  1920. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1921. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1922. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1923. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1924. MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
  1925. "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
  1926. "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
  1927. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
  1928. "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
  1929. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
  1930. "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
  1931. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1932. "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1933. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1934. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1935. MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
  1936. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1937. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1938. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1939. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1940. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1941. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1942. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1943. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1944. MOVNTQ" %%mm0, (%3, %%eax) \n\t"
  1945. MOVNTQ" %%mm2, (%2, %%eax) \n\t"
  1946. "addl $8, %%eax \n\t"
  1947. "cmpl %4, %%eax \n\t"
  1948. " jb 1b \n\t"
  1949. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1950. : "memory", "%eax"
  1951. );
  1952. ydst += lumStride;
  1953. src += srcStride;
  1954. asm volatile(
  1955. "xorl %%eax, %%eax \n\t"
  1956. ASMALIGN(4)
  1957. "1: \n\t"
  1958. PREFETCH" 64(%0, %%eax, 4) \n\t"
  1959. "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
  1960. "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
  1961. "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
  1962. "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
  1963. "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1964. "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1965. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1966. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1967. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1968. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1969. MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
  1970. MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
  1971. "addl $8, %%eax \n\t"
  1972. "cmpl %4, %%eax \n\t"
  1973. " jb 1b \n\t"
  1974. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1975. : "memory", "%eax"
  1976. );
  1977. #else
  1978. long i;
  1979. for(i=0; i<chromWidth; i++)
  1980. {
  1981. udst[i] = src[4*i+0];
  1982. ydst[2*i+0] = src[4*i+1];
  1983. vdst[i] = src[4*i+2];
  1984. ydst[2*i+1] = src[4*i+3];
  1985. }
  1986. ydst += lumStride;
  1987. src += srcStride;
  1988. for(i=0; i<chromWidth; i++)
  1989. {
  1990. ydst[2*i+0] = src[4*i+1];
  1991. ydst[2*i+1] = src[4*i+3];
  1992. }
  1993. #endif
  1994. udst += chromStride;
  1995. vdst += chromStride;
  1996. ydst += lumStride;
  1997. src += srcStride;
  1998. }
  1999. #ifdef HAVE_MMX
  2000. asm volatile( EMMS" \n\t"
  2001. SFENCE" \n\t"
  2002. :::"memory");
  2003. #endif
  2004. }
  2005. /**
  2006. *
  2007. * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
  2008. * problem for anyone then tell me, and ill fix it)
  2009. * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
  2010. */
  2011. static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  2012. long width, long height,
  2013. long lumStride, long chromStride, long srcStride)
  2014. {
  2015. long y;
  2016. const long chromWidth= width>>1;
  2017. #ifdef HAVE_MMX
  2018. for(y=0; y<height-2; y+=2)
  2019. {
  2020. long i;
  2021. for(i=0; i<2; i++)
  2022. {
  2023. asm volatile(
  2024. "mov %2, %%"REG_a" \n\t"
  2025. "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
  2026. "movq "MANGLE(w1111)", %%mm5 \n\t"
  2027. "pxor %%mm7, %%mm7 \n\t"
  2028. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
  2029. ASMALIGN(4)
  2030. "1: \n\t"
  2031. PREFETCH" 64(%0, %%"REG_d") \n\t"
  2032. "movd (%0, %%"REG_d"), %%mm0 \n\t"
  2033. "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
  2034. "punpcklbw %%mm7, %%mm0 \n\t"
  2035. "punpcklbw %%mm7, %%mm1 \n\t"
  2036. "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
  2037. "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
  2038. "punpcklbw %%mm7, %%mm2 \n\t"
  2039. "punpcklbw %%mm7, %%mm3 \n\t"
  2040. "pmaddwd %%mm6, %%mm0 \n\t"
  2041. "pmaddwd %%mm6, %%mm1 \n\t"
  2042. "pmaddwd %%mm6, %%mm2 \n\t"
  2043. "pmaddwd %%mm6, %%mm3 \n\t"
  2044. #ifndef FAST_BGR2YV12
  2045. "psrad $8, %%mm0 \n\t"
  2046. "psrad $8, %%mm1 \n\t"
  2047. "psrad $8, %%mm2 \n\t"
  2048. "psrad $8, %%mm3 \n\t"
  2049. #endif
  2050. "packssdw %%mm1, %%mm0 \n\t"
  2051. "packssdw %%mm3, %%mm2 \n\t"
  2052. "pmaddwd %%mm5, %%mm0 \n\t"
  2053. "pmaddwd %%mm5, %%mm2 \n\t"
  2054. "packssdw %%mm2, %%mm0 \n\t"
  2055. "psraw $7, %%mm0 \n\t"
  2056. "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
  2057. "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
  2058. "punpcklbw %%mm7, %%mm4 \n\t"
  2059. "punpcklbw %%mm7, %%mm1 \n\t"
  2060. "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
  2061. "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
  2062. "punpcklbw %%mm7, %%mm2 \n\t"
  2063. "punpcklbw %%mm7, %%mm3 \n\t"
  2064. "pmaddwd %%mm6, %%mm4 \n\t"
  2065. "pmaddwd %%mm6, %%mm1 \n\t"
  2066. "pmaddwd %%mm6, %%mm2 \n\t"
  2067. "pmaddwd %%mm6, %%mm3 \n\t"
  2068. #ifndef FAST_BGR2YV12
  2069. "psrad $8, %%mm4 \n\t"
  2070. "psrad $8, %%mm1 \n\t"
  2071. "psrad $8, %%mm2 \n\t"
  2072. "psrad $8, %%mm3 \n\t"
  2073. #endif
  2074. "packssdw %%mm1, %%mm4 \n\t"
  2075. "packssdw %%mm3, %%mm2 \n\t"
  2076. "pmaddwd %%mm5, %%mm4 \n\t"
  2077. "pmaddwd %%mm5, %%mm2 \n\t"
  2078. "add $24, %%"REG_d" \n\t"
  2079. "packssdw %%mm2, %%mm4 \n\t"
  2080. "psraw $7, %%mm4 \n\t"
  2081. "packuswb %%mm4, %%mm0 \n\t"
  2082. "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
  2083. MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
  2084. "add $8, %%"REG_a" \n\t"
  2085. " js 1b \n\t"
  2086. : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
  2087. : "%"REG_a, "%"REG_d
  2088. );
  2089. ydst += lumStride;
  2090. src += srcStride;
  2091. }
  2092. src -= srcStride*2;
  2093. asm volatile(
  2094. "mov %4, %%"REG_a" \n\t"
  2095. "movq "MANGLE(w1111)", %%mm5 \n\t"
  2096. "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
  2097. "pxor %%mm7, %%mm7 \n\t"
  2098. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
  2099. "add %%"REG_d", %%"REG_d" \n\t"
  2100. ASMALIGN(4)
  2101. "1: \n\t"
  2102. PREFETCH" 64(%0, %%"REG_d") \n\t"
  2103. PREFETCH" 64(%1, %%"REG_d") \n\t"
  2104. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  2105. "movq (%0, %%"REG_d"), %%mm0 \n\t"
  2106. "movq (%1, %%"REG_d"), %%mm1 \n\t"
  2107. "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
  2108. "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
  2109. PAVGB" %%mm1, %%mm0 \n\t"
  2110. PAVGB" %%mm3, %%mm2 \n\t"
  2111. "movq %%mm0, %%mm1 \n\t"
  2112. "movq %%mm2, %%mm3 \n\t"
  2113. "psrlq $24, %%mm0 \n\t"
  2114. "psrlq $24, %%mm2 \n\t"
  2115. PAVGB" %%mm1, %%mm0 \n\t"
  2116. PAVGB" %%mm3, %%mm2 \n\t"
  2117. "punpcklbw %%mm7, %%mm0 \n\t"
  2118. "punpcklbw %%mm7, %%mm2 \n\t"
  2119. #else
  2120. "movd (%0, %%"REG_d"), %%mm0 \n\t"
  2121. "movd (%1, %%"REG_d"), %%mm1 \n\t"
  2122. "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
  2123. "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
  2124. "punpcklbw %%mm7, %%mm0 \n\t"
  2125. "punpcklbw %%mm7, %%mm1 \n\t"
  2126. "punpcklbw %%mm7, %%mm2 \n\t"
  2127. "punpcklbw %%mm7, %%mm3 \n\t"
  2128. "paddw %%mm1, %%mm0 \n\t"
  2129. "paddw %%mm3, %%mm2 \n\t"
  2130. "paddw %%mm2, %%mm0 \n\t"
  2131. "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
  2132. "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
  2133. "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
  2134. "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
  2135. "punpcklbw %%mm7, %%mm4 \n\t"
  2136. "punpcklbw %%mm7, %%mm1 \n\t"
  2137. "punpcklbw %%mm7, %%mm2 \n\t"
  2138. "punpcklbw %%mm7, %%mm3 \n\t"
  2139. "paddw %%mm1, %%mm4 \n\t"
  2140. "paddw %%mm3, %%mm2 \n\t"
  2141. "paddw %%mm4, %%mm2 \n\t"
  2142. "psrlw $2, %%mm0 \n\t"
  2143. "psrlw $2, %%mm2 \n\t"
  2144. #endif
  2145. "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
  2146. "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
  2147. "pmaddwd %%mm0, %%mm1 \n\t"
  2148. "pmaddwd %%mm2, %%mm3 \n\t"
  2149. "pmaddwd %%mm6, %%mm0 \n\t"
  2150. "pmaddwd %%mm6, %%mm2 \n\t"
  2151. #ifndef FAST_BGR2YV12
  2152. "psrad $8, %%mm0 \n\t"
  2153. "psrad $8, %%mm1 \n\t"
  2154. "psrad $8, %%mm2 \n\t"
  2155. "psrad $8, %%mm3 \n\t"
  2156. #endif
  2157. "packssdw %%mm2, %%mm0 \n\t"
  2158. "packssdw %%mm3, %%mm1 \n\t"
  2159. "pmaddwd %%mm5, %%mm0 \n\t"
  2160. "pmaddwd %%mm5, %%mm1 \n\t"
  2161. "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
  2162. "psraw $7, %%mm0 \n\t"
  2163. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  2164. "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
  2165. "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
  2166. "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
  2167. "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
  2168. PAVGB" %%mm1, %%mm4 \n\t"
  2169. PAVGB" %%mm3, %%mm2 \n\t"
  2170. "movq %%mm4, %%mm1 \n\t"
  2171. "movq %%mm2, %%mm3 \n\t"
  2172. "psrlq $24, %%mm4 \n\t"
  2173. "psrlq $24, %%mm2 \n\t"
  2174. PAVGB" %%mm1, %%mm4 \n\t"
  2175. PAVGB" %%mm3, %%mm2 \n\t"
  2176. "punpcklbw %%mm7, %%mm4 \n\t"
  2177. "punpcklbw %%mm7, %%mm2 \n\t"
  2178. #else
  2179. "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
  2180. "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
  2181. "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
  2182. "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
  2183. "punpcklbw %%mm7, %%mm4 \n\t"
  2184. "punpcklbw %%mm7, %%mm1 \n\t"
  2185. "punpcklbw %%mm7, %%mm2 \n\t"
  2186. "punpcklbw %%mm7, %%mm3 \n\t"
  2187. "paddw %%mm1, %%mm4 \n\t"
  2188. "paddw %%mm3, %%mm2 \n\t"
  2189. "paddw %%mm2, %%mm4 \n\t"
  2190. "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
  2191. "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
  2192. "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
  2193. "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
  2194. "punpcklbw %%mm7, %%mm5 \n\t"
  2195. "punpcklbw %%mm7, %%mm1 \n\t"
  2196. "punpcklbw %%mm7, %%mm2 \n\t"
  2197. "punpcklbw %%mm7, %%mm3 \n\t"
  2198. "paddw %%mm1, %%mm5 \n\t"
  2199. "paddw %%mm3, %%mm2 \n\t"
  2200. "paddw %%mm5, %%mm2 \n\t"
  2201. "movq "MANGLE(w1111)", %%mm5 \n\t"
  2202. "psrlw $2, %%mm4 \n\t"
  2203. "psrlw $2, %%mm2 \n\t"
  2204. #endif
  2205. "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
  2206. "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
  2207. "pmaddwd %%mm4, %%mm1 \n\t"
  2208. "pmaddwd %%mm2, %%mm3 \n\t"
  2209. "pmaddwd %%mm6, %%mm4 \n\t"
  2210. "pmaddwd %%mm6, %%mm2 \n\t"
  2211. #ifndef FAST_BGR2YV12
  2212. "psrad $8, %%mm4 \n\t"
  2213. "psrad $8, %%mm1 \n\t"
  2214. "psrad $8, %%mm2 \n\t"
  2215. "psrad $8, %%mm3 \n\t"
  2216. #endif
  2217. "packssdw %%mm2, %%mm4 \n\t"
  2218. "packssdw %%mm3, %%mm1 \n\t"
  2219. "pmaddwd %%mm5, %%mm4 \n\t"
  2220. "pmaddwd %%mm5, %%mm1 \n\t"
  2221. "add $24, %%"REG_d" \n\t"
  2222. "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
  2223. "psraw $7, %%mm4 \n\t"
  2224. "movq %%mm0, %%mm1 \n\t"
  2225. "punpckldq %%mm4, %%mm0 \n\t"
  2226. "punpckhdq %%mm4, %%mm1 \n\t"
  2227. "packsswb %%mm1, %%mm0 \n\t"
  2228. "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
  2229. "movd %%mm0, (%2, %%"REG_a") \n\t"
  2230. "punpckhdq %%mm0, %%mm0 \n\t"
  2231. "movd %%mm0, (%3, %%"REG_a") \n\t"
  2232. "add $4, %%"REG_a" \n\t"
  2233. " js 1b \n\t"
  2234. : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
  2235. : "%"REG_a, "%"REG_d
  2236. );
  2237. udst += chromStride;
  2238. vdst += chromStride;
  2239. src += srcStride*2;
  2240. }
  2241. asm volatile( EMMS" \n\t"
  2242. SFENCE" \n\t"
  2243. :::"memory");
  2244. #else
  2245. y=0;
  2246. #endif
  2247. for(; y<height; y+=2)
  2248. {
  2249. long i;
  2250. for(i=0; i<chromWidth; i++)
  2251. {
  2252. unsigned int b= src[6*i+0];
  2253. unsigned int g= src[6*i+1];
  2254. unsigned int r= src[6*i+2];
  2255. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2256. unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
  2257. unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
  2258. udst[i] = U;
  2259. vdst[i] = V;
  2260. ydst[2*i] = Y;
  2261. b= src[6*i+3];
  2262. g= src[6*i+4];
  2263. r= src[6*i+5];
  2264. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2265. ydst[2*i+1] = Y;
  2266. }
  2267. ydst += lumStride;
  2268. src += srcStride;
  2269. for(i=0; i<chromWidth; i++)
  2270. {
  2271. unsigned int b= src[6*i+0];
  2272. unsigned int g= src[6*i+1];
  2273. unsigned int r= src[6*i+2];
  2274. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2275. ydst[2*i] = Y;
  2276. b= src[6*i+3];
  2277. g= src[6*i+4];
  2278. r= src[6*i+5];
  2279. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2280. ydst[2*i+1] = Y;
  2281. }
  2282. udst += chromStride;
  2283. vdst += chromStride;
  2284. ydst += lumStride;
  2285. src += srcStride;
  2286. }
  2287. }
  2288. void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
  2289. long width, long height, long src1Stride,
  2290. long src2Stride, long dstStride){
  2291. long h;
  2292. for(h=0; h < height; h++)
  2293. {
  2294. long w;
  2295. #ifdef HAVE_MMX
  2296. #ifdef HAVE_SSE2
  2297. asm(
  2298. "xor %%"REG_a", %%"REG_a" \n\t"
  2299. "1: \n\t"
  2300. PREFETCH" 64(%1, %%"REG_a") \n\t"
  2301. PREFETCH" 64(%2, %%"REG_a") \n\t"
  2302. "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
  2303. "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
  2304. "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
  2305. "punpcklbw %%xmm2, %%xmm0 \n\t"
  2306. "punpckhbw %%xmm2, %%xmm1 \n\t"
  2307. "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
  2308. "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
  2309. "add $16, %%"REG_a" \n\t"
  2310. "cmp %3, %%"REG_a" \n\t"
  2311. " jb 1b \n\t"
  2312. ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
  2313. : "memory", "%"REG_a""
  2314. );
  2315. #else
  2316. asm(
  2317. "xor %%"REG_a", %%"REG_a" \n\t"
  2318. "1: \n\t"
  2319. PREFETCH" 64(%1, %%"REG_a") \n\t"
  2320. PREFETCH" 64(%2, %%"REG_a") \n\t"
  2321. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  2322. "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
  2323. "movq %%mm0, %%mm1 \n\t"
  2324. "movq %%mm2, %%mm3 \n\t"
  2325. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  2326. "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
  2327. "punpcklbw %%mm4, %%mm0 \n\t"
  2328. "punpckhbw %%mm4, %%mm1 \n\t"
  2329. "punpcklbw %%mm5, %%mm2 \n\t"
  2330. "punpckhbw %%mm5, %%mm3 \n\t"
  2331. MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
  2332. MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
  2333. MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
  2334. MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
  2335. "add $16, %%"REG_a" \n\t"
  2336. "cmp %3, %%"REG_a" \n\t"
  2337. " jb 1b \n\t"
  2338. ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
  2339. : "memory", "%"REG_a
  2340. );
  2341. #endif
  2342. for(w= (width&(~15)); w < width; w++)
  2343. {
  2344. dest[2*w+0] = src1[w];
  2345. dest[2*w+1] = src2[w];
  2346. }
  2347. #else
  2348. for(w=0; w < width; w++)
  2349. {
  2350. dest[2*w+0] = src1[w];
  2351. dest[2*w+1] = src2[w];
  2352. }
  2353. #endif
  2354. dest += dstStride;
  2355. src1 += src1Stride;
  2356. src2 += src2Stride;
  2357. }
  2358. #ifdef HAVE_MMX
  2359. asm(
  2360. EMMS" \n\t"
  2361. SFENCE" \n\t"
  2362. ::: "memory"
  2363. );
  2364. #endif
  2365. }
  2366. static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
  2367. uint8_t *dst1, uint8_t *dst2,
  2368. long width, long height,
  2369. long srcStride1, long srcStride2,
  2370. long dstStride1, long dstStride2)
  2371. {
  2372. long y,x,w,h;
  2373. w=width/2; h=height/2;
  2374. #ifdef HAVE_MMX
  2375. asm volatile(
  2376. PREFETCH" %0\n\t"
  2377. PREFETCH" %1\n\t"
  2378. ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
  2379. #endif
  2380. for(y=0;y<h;y++){
  2381. const uint8_t* s1=src1+srcStride1*(y>>1);
  2382. uint8_t* d=dst1+dstStride1*y;
  2383. x=0;
  2384. #ifdef HAVE_MMX
  2385. for(;x<w-31;x+=32)
  2386. {
  2387. asm volatile(
  2388. PREFETCH" 32%1\n\t"
  2389. "movq %1, %%mm0\n\t"
  2390. "movq 8%1, %%mm2\n\t"
  2391. "movq 16%1, %%mm4\n\t"
  2392. "movq 24%1, %%mm6\n\t"
  2393. "movq %%mm0, %%mm1\n\t"
  2394. "movq %%mm2, %%mm3\n\t"
  2395. "movq %%mm4, %%mm5\n\t"
  2396. "movq %%mm6, %%mm7\n\t"
  2397. "punpcklbw %%mm0, %%mm0\n\t"
  2398. "punpckhbw %%mm1, %%mm1\n\t"
  2399. "punpcklbw %%mm2, %%mm2\n\t"
  2400. "punpckhbw %%mm3, %%mm3\n\t"
  2401. "punpcklbw %%mm4, %%mm4\n\t"
  2402. "punpckhbw %%mm5, %%mm5\n\t"
  2403. "punpcklbw %%mm6, %%mm6\n\t"
  2404. "punpckhbw %%mm7, %%mm7\n\t"
  2405. MOVNTQ" %%mm0, %0\n\t"
  2406. MOVNTQ" %%mm1, 8%0\n\t"
  2407. MOVNTQ" %%mm2, 16%0\n\t"
  2408. MOVNTQ" %%mm3, 24%0\n\t"
  2409. MOVNTQ" %%mm4, 32%0\n\t"
  2410. MOVNTQ" %%mm5, 40%0\n\t"
  2411. MOVNTQ" %%mm6, 48%0\n\t"
  2412. MOVNTQ" %%mm7, 56%0"
  2413. :"=m"(d[2*x])
  2414. :"m"(s1[x])
  2415. :"memory");
  2416. }
  2417. #endif
  2418. for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
  2419. }
  2420. for(y=0;y<h;y++){
  2421. const uint8_t* s2=src2+srcStride2*(y>>1);
  2422. uint8_t* d=dst2+dstStride2*y;
  2423. x=0;
  2424. #ifdef HAVE_MMX
  2425. for(;x<w-31;x+=32)
  2426. {
  2427. asm volatile(
  2428. PREFETCH" 32%1\n\t"
  2429. "movq %1, %%mm0\n\t"
  2430. "movq 8%1, %%mm2\n\t"
  2431. "movq 16%1, %%mm4\n\t"
  2432. "movq 24%1, %%mm6\n\t"
  2433. "movq %%mm0, %%mm1\n\t"
  2434. "movq %%mm2, %%mm3\n\t"
  2435. "movq %%mm4, %%mm5\n\t"
  2436. "movq %%mm6, %%mm7\n\t"
  2437. "punpcklbw %%mm0, %%mm0\n\t"
  2438. "punpckhbw %%mm1, %%mm1\n\t"
  2439. "punpcklbw %%mm2, %%mm2\n\t"
  2440. "punpckhbw %%mm3, %%mm3\n\t"
  2441. "punpcklbw %%mm4, %%mm4\n\t"
  2442. "punpckhbw %%mm5, %%mm5\n\t"
  2443. "punpcklbw %%mm6, %%mm6\n\t"
  2444. "punpckhbw %%mm7, %%mm7\n\t"
  2445. MOVNTQ" %%mm0, %0\n\t"
  2446. MOVNTQ" %%mm1, 8%0\n\t"
  2447. MOVNTQ" %%mm2, 16%0\n\t"
  2448. MOVNTQ" %%mm3, 24%0\n\t"
  2449. MOVNTQ" %%mm4, 32%0\n\t"
  2450. MOVNTQ" %%mm5, 40%0\n\t"
  2451. MOVNTQ" %%mm6, 48%0\n\t"
  2452. MOVNTQ" %%mm7, 56%0"
  2453. :"=m"(d[2*x])
  2454. :"m"(s2[x])
  2455. :"memory");
  2456. }
  2457. #endif
  2458. for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
  2459. }
  2460. #ifdef HAVE_MMX
  2461. asm(
  2462. EMMS" \n\t"
  2463. SFENCE" \n\t"
  2464. ::: "memory"
  2465. );
  2466. #endif
  2467. }
  2468. static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
  2469. uint8_t *dst,
  2470. long width, long height,
  2471. long srcStride1, long srcStride2,
  2472. long srcStride3, long dstStride)
  2473. {
  2474. long y,x,w,h;
  2475. w=width/2; h=height;
  2476. for(y=0;y<h;y++){
  2477. const uint8_t* yp=src1+srcStride1*y;
  2478. const uint8_t* up=src2+srcStride2*(y>>2);
  2479. const uint8_t* vp=src3+srcStride3*(y>>2);
  2480. uint8_t* d=dst+dstStride*y;
  2481. x=0;
  2482. #ifdef HAVE_MMX
  2483. for(;x<w-7;x+=8)
  2484. {
  2485. asm volatile(
  2486. PREFETCH" 32(%1, %0)\n\t"
  2487. PREFETCH" 32(%2, %0)\n\t"
  2488. PREFETCH" 32(%3, %0)\n\t"
  2489. "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
  2490. "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
  2491. "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
  2492. "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
  2493. "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
  2494. "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
  2495. "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
  2496. "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
  2497. "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
  2498. "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
  2499. "movq %%mm1, %%mm6\n\t"
  2500. "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
  2501. "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
  2502. "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
  2503. MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
  2504. MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
  2505. "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
  2506. "movq 8(%1, %0, 4), %%mm0\n\t"
  2507. "movq %%mm0, %%mm3\n\t"
  2508. "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
  2509. "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
  2510. MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
  2511. MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
  2512. "movq %%mm4, %%mm6\n\t"
  2513. "movq 16(%1, %0, 4), %%mm0\n\t"
  2514. "movq %%mm0, %%mm3\n\t"
  2515. "punpcklbw %%mm5, %%mm4\n\t"
  2516. "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
  2517. "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
  2518. MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
  2519. MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
  2520. "punpckhbw %%mm5, %%mm6\n\t"
  2521. "movq 24(%1, %0, 4), %%mm0\n\t"
  2522. "movq %%mm0, %%mm3\n\t"
  2523. "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
  2524. "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
  2525. MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
  2526. MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
  2527. : "+r" (x)
  2528. : "r"(yp), "r" (up), "r"(vp), "r"(d)
  2529. :"memory");
  2530. }
  2531. #endif
  2532. for(; x<w; x++)
  2533. {
  2534. const long x2= x<<2;
  2535. d[8*x+0]=yp[x2];
  2536. d[8*x+1]=up[x];
  2537. d[8*x+2]=yp[x2+1];
  2538. d[8*x+3]=vp[x];
  2539. d[8*x+4]=yp[x2+2];
  2540. d[8*x+5]=up[x];
  2541. d[8*x+6]=yp[x2+3];
  2542. d[8*x+7]=vp[x];
  2543. }
  2544. }
  2545. #ifdef HAVE_MMX
  2546. asm(
  2547. EMMS" \n\t"
  2548. SFENCE" \n\t"
  2549. ::: "memory"
  2550. );
  2551. #endif
  2552. }