You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2668 lines
67KB

/*
 * rgb2rgb.c, software RGB to RGB converter
 * plus software PAL8 to RGB converter
 * plus software YUV to YUV converter
 * plus software YUV to RGB converter
 * Written by Nick Kurshev.
 * Palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) (under GPL)
 * Lots of big-endian byte-order fixes by Alex Beregszaszi
 */
  11. #include <stddef.h>
  12. #include <inttypes.h> /* for __WORDSIZE */
  13. #ifndef __WORDSIZE
  14. // #warning You have misconfigured system and probably will lose performance!
  15. #define __WORDSIZE MP_WORDSIZE
  16. #endif
  17. #undef PREFETCH
  18. #undef MOVNTQ
  19. #undef EMMS
  20. #undef SFENCE
  21. #undef MMREG_SIZE
  22. #undef PREFETCHW
  23. #undef PAVGB
  24. #ifdef HAVE_SSE2
  25. #define MMREG_SIZE 16
  26. #else
  27. #define MMREG_SIZE 8
  28. #endif
  29. #ifdef HAVE_3DNOW
  30. #define PREFETCH "prefetch"
  31. #define PREFETCHW "prefetchw"
  32. #define PAVGB "pavgusb"
  33. #elif defined ( HAVE_MMX2 )
  34. #define PREFETCH "prefetchnta"
  35. #define PREFETCHW "prefetcht0"
  36. #define PAVGB "pavgb"
  37. #else
  38. #define PREFETCH "/nop"
  39. #define PREFETCHW "/nop"
  40. #endif
  41. #ifdef HAVE_3DNOW
  42. /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
  43. #define EMMS "femms"
  44. #else
  45. #define EMMS "emms"
  46. #endif
  47. #ifdef HAVE_MMX2
  48. #define MOVNTQ "movntq"
  49. #define SFENCE "sfence"
  50. #else
  51. #define MOVNTQ "movq"
  52. #define SFENCE "/nop"
  53. #endif
  54. static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
  55. {
  56. uint8_t *dest = dst;
  57. const uint8_t *s = src;
  58. const uint8_t *end;
  59. #ifdef HAVE_MMX
  60. const uint8_t *mm_end;
  61. #endif
  62. end = s + src_size;
  63. #ifdef HAVE_MMX
  64. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  65. mm_end = end - 23;
  66. __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
  67. while(s < mm_end)
  68. {
  69. __asm __volatile(
  70. PREFETCH" 32%1\n\t"
  71. "movd %1, %%mm0\n\t"
  72. "punpckldq 3%1, %%mm0\n\t"
  73. "movd 6%1, %%mm1\n\t"
  74. "punpckldq 9%1, %%mm1\n\t"
  75. "movd 12%1, %%mm2\n\t"
  76. "punpckldq 15%1, %%mm2\n\t"
  77. "movd 18%1, %%mm3\n\t"
  78. "punpckldq 21%1, %%mm3\n\t"
  79. "pand %%mm7, %%mm0\n\t"
  80. "pand %%mm7, %%mm1\n\t"
  81. "pand %%mm7, %%mm2\n\t"
  82. "pand %%mm7, %%mm3\n\t"
  83. MOVNTQ" %%mm0, %0\n\t"
  84. MOVNTQ" %%mm1, 8%0\n\t"
  85. MOVNTQ" %%mm2, 16%0\n\t"
  86. MOVNTQ" %%mm3, 24%0"
  87. :"=m"(*dest)
  88. :"m"(*s)
  89. :"memory");
  90. dest += 32;
  91. s += 24;
  92. }
  93. __asm __volatile(SFENCE:::"memory");
  94. __asm __volatile(EMMS:::"memory");
  95. #endif
  96. while(s < end)
  97. {
  98. #ifdef WORDS_BIGENDIAN
  99. *dest++ = 0;
  100. *dest++ = *s++;
  101. *dest++ = *s++;
  102. *dest++ = *s++;
  103. #else
  104. *dest++ = *s++;
  105. *dest++ = *s++;
  106. *dest++ = *s++;
  107. *dest++ = 0;
  108. #endif
  109. }
  110. }
  111. static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
  112. {
  113. uint8_t *dest = dst;
  114. const uint8_t *s = src;
  115. const uint8_t *end;
  116. #ifdef HAVE_MMX
  117. const uint8_t *mm_end;
  118. #endif
  119. end = s + src_size;
  120. #ifdef HAVE_MMX
  121. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  122. mm_end = end - 31;
  123. while(s < mm_end)
  124. {
  125. __asm __volatile(
  126. PREFETCH" 32%1\n\t"
  127. "movq %1, %%mm0\n\t"
  128. "movq 8%1, %%mm1\n\t"
  129. "movq 16%1, %%mm4\n\t"
  130. "movq 24%1, %%mm5\n\t"
  131. "movq %%mm0, %%mm2\n\t"
  132. "movq %%mm1, %%mm3\n\t"
  133. "movq %%mm4, %%mm6\n\t"
  134. "movq %%mm5, %%mm7\n\t"
  135. "psrlq $8, %%mm2\n\t"
  136. "psrlq $8, %%mm3\n\t"
  137. "psrlq $8, %%mm6\n\t"
  138. "psrlq $8, %%mm7\n\t"
  139. "pand %2, %%mm0\n\t"
  140. "pand %2, %%mm1\n\t"
  141. "pand %2, %%mm4\n\t"
  142. "pand %2, %%mm5\n\t"
  143. "pand %3, %%mm2\n\t"
  144. "pand %3, %%mm3\n\t"
  145. "pand %3, %%mm6\n\t"
  146. "pand %3, %%mm7\n\t"
  147. "por %%mm2, %%mm0\n\t"
  148. "por %%mm3, %%mm1\n\t"
  149. "por %%mm6, %%mm4\n\t"
  150. "por %%mm7, %%mm5\n\t"
  151. "movq %%mm1, %%mm2\n\t"
  152. "movq %%mm4, %%mm3\n\t"
  153. "psllq $48, %%mm2\n\t"
  154. "psllq $32, %%mm3\n\t"
  155. "pand %4, %%mm2\n\t"
  156. "pand %5, %%mm3\n\t"
  157. "por %%mm2, %%mm0\n\t"
  158. "psrlq $16, %%mm1\n\t"
  159. "psrlq $32, %%mm4\n\t"
  160. "psllq $16, %%mm5\n\t"
  161. "por %%mm3, %%mm1\n\t"
  162. "pand %6, %%mm5\n\t"
  163. "por %%mm5, %%mm4\n\t"
  164. MOVNTQ" %%mm0, %0\n\t"
  165. MOVNTQ" %%mm1, 8%0\n\t"
  166. MOVNTQ" %%mm4, 16%0"
  167. :"=m"(*dest)
  168. :"m"(*s),"m"(mask24l),
  169. "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
  170. :"memory");
  171. dest += 24;
  172. s += 32;
  173. }
  174. __asm __volatile(SFENCE:::"memory");
  175. __asm __volatile(EMMS:::"memory");
  176. #endif
  177. while(s < end)
  178. {
  179. #ifdef WORDS_BIGENDIAN
  180. s++;
  181. *dest++ = *s++;
  182. *dest++ = *s++;
  183. *dest++ = *s++;
  184. #else
  185. *dest++ = *s++;
  186. *dest++ = *s++;
  187. *dest++ = *s++;
  188. s++;
  189. #endif
  190. }
  191. }
/*
 * Original by Strepto/Astral
 * ported to gcc & bugfixed: A'rpi
 * MMX2, 3DNow! optimization by Nick Kurshev
 * 32-bit C version, and the and&add trick, by Michael Niedermayer
 */
  198. static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
  199. {
  200. register const uint8_t* s=src;
  201. register uint8_t* d=dst;
  202. register const uint8_t *end;
  203. const uint8_t *mm_end;
  204. end = s + src_size;
  205. #ifdef HAVE_MMX
  206. __asm __volatile(PREFETCH" %0"::"m"(*s));
  207. __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
  208. mm_end = end - 15;
  209. while(s<mm_end)
  210. {
  211. __asm __volatile(
  212. PREFETCH" 32%1\n\t"
  213. "movq %1, %%mm0\n\t"
  214. "movq 8%1, %%mm2\n\t"
  215. "movq %%mm0, %%mm1\n\t"
  216. "movq %%mm2, %%mm3\n\t"
  217. "pand %%mm4, %%mm0\n\t"
  218. "pand %%mm4, %%mm2\n\t"
  219. "paddw %%mm1, %%mm0\n\t"
  220. "paddw %%mm3, %%mm2\n\t"
  221. MOVNTQ" %%mm0, %0\n\t"
  222. MOVNTQ" %%mm2, 8%0"
  223. :"=m"(*d)
  224. :"m"(*s)
  225. );
  226. d+=16;
  227. s+=16;
  228. }
  229. __asm __volatile(SFENCE:::"memory");
  230. __asm __volatile(EMMS:::"memory");
  231. #endif
  232. mm_end = end - 3;
  233. while(s < mm_end)
  234. {
  235. register unsigned x= *((uint32_t *)s);
  236. *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
  237. d+=4;
  238. s+=4;
  239. }
  240. if(s < end)
  241. {
  242. register unsigned short x= *((uint16_t *)s);
  243. *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
  244. }
  245. }
  246. static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
  247. {
  248. register const uint8_t* s=src;
  249. register uint8_t* d=dst;
  250. register const uint8_t *end;
  251. const uint8_t *mm_end;
  252. end = s + src_size;
  253. #ifdef HAVE_MMX
  254. __asm __volatile(PREFETCH" %0"::"m"(*s));
  255. __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
  256. __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
  257. mm_end = end - 15;
  258. while(s<mm_end)
  259. {
  260. __asm __volatile(
  261. PREFETCH" 32%1\n\t"
  262. "movq %1, %%mm0\n\t"
  263. "movq 8%1, %%mm2\n\t"
  264. "movq %%mm0, %%mm1\n\t"
  265. "movq %%mm2, %%mm3\n\t"
  266. "psrlq $1, %%mm0\n\t"
  267. "psrlq $1, %%mm2\n\t"
  268. "pand %%mm7, %%mm0\n\t"
  269. "pand %%mm7, %%mm2\n\t"
  270. "pand %%mm6, %%mm1\n\t"
  271. "pand %%mm6, %%mm3\n\t"
  272. "por %%mm1, %%mm0\n\t"
  273. "por %%mm3, %%mm2\n\t"
  274. MOVNTQ" %%mm0, %0\n\t"
  275. MOVNTQ" %%mm2, 8%0"
  276. :"=m"(*d)
  277. :"m"(*s)
  278. );
  279. d+=16;
  280. s+=16;
  281. }
  282. __asm __volatile(SFENCE:::"memory");
  283. __asm __volatile(EMMS:::"memory");
  284. #endif
  285. mm_end = end - 3;
  286. while(s < mm_end)
  287. {
  288. register uint32_t x= *((uint32_t *)s);
  289. *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
  290. s+=4;
  291. d+=4;
  292. }
  293. if(s < end)
  294. {
  295. register uint16_t x= *((uint16_t *)s);
  296. *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
  297. s+=2;
  298. d+=2;
  299. }
  300. }
  301. static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
  302. {
  303. const uint8_t *s = src;
  304. const uint8_t *end;
  305. #ifdef HAVE_MMX
  306. const uint8_t *mm_end;
  307. #endif
  308. uint16_t *d = (uint16_t *)dst;
  309. end = s + src_size;
  310. #ifdef HAVE_MMX
  311. mm_end = end - 15;
  312. #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
  313. asm volatile(
  314. "movq %3, %%mm5 \n\t"
  315. "movq %4, %%mm6 \n\t"
  316. "movq %5, %%mm7 \n\t"
  317. ".balign 16 \n\t"
  318. "1: \n\t"
  319. PREFETCH" 32(%1) \n\t"
  320. "movd (%1), %%mm0 \n\t"
  321. "movd 4(%1), %%mm3 \n\t"
  322. "punpckldq 8(%1), %%mm0 \n\t"
  323. "punpckldq 12(%1), %%mm3 \n\t"
  324. "movq %%mm0, %%mm1 \n\t"
  325. "movq %%mm3, %%mm4 \n\t"
  326. "pand %%mm6, %%mm0 \n\t"
  327. "pand %%mm6, %%mm3 \n\t"
  328. "pmaddwd %%mm7, %%mm0 \n\t"
  329. "pmaddwd %%mm7, %%mm3 \n\t"
  330. "pand %%mm5, %%mm1 \n\t"
  331. "pand %%mm5, %%mm4 \n\t"
  332. "por %%mm1, %%mm0 \n\t"
  333. "por %%mm4, %%mm3 \n\t"
  334. "psrld $5, %%mm0 \n\t"
  335. "pslld $11, %%mm3 \n\t"
  336. "por %%mm3, %%mm0 \n\t"
  337. MOVNTQ" %%mm0, (%0) \n\t"
  338. "add $16, %1 \n\t"
  339. "add $8, %0 \n\t"
  340. "cmp %2, %1 \n\t"
  341. " jb 1b \n\t"
  342. : "+r" (d), "+r"(s)
  343. : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
  344. );
  345. #else
  346. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  347. __asm __volatile(
  348. "movq %0, %%mm7\n\t"
  349. "movq %1, %%mm6\n\t"
  350. ::"m"(red_16mask),"m"(green_16mask));
  351. while(s < mm_end)
  352. {
  353. __asm __volatile(
  354. PREFETCH" 32%1\n\t"
  355. "movd %1, %%mm0\n\t"
  356. "movd 4%1, %%mm3\n\t"
  357. "punpckldq 8%1, %%mm0\n\t"
  358. "punpckldq 12%1, %%mm3\n\t"
  359. "movq %%mm0, %%mm1\n\t"
  360. "movq %%mm0, %%mm2\n\t"
  361. "movq %%mm3, %%mm4\n\t"
  362. "movq %%mm3, %%mm5\n\t"
  363. "psrlq $3, %%mm0\n\t"
  364. "psrlq $3, %%mm3\n\t"
  365. "pand %2, %%mm0\n\t"
  366. "pand %2, %%mm3\n\t"
  367. "psrlq $5, %%mm1\n\t"
  368. "psrlq $5, %%mm4\n\t"
  369. "pand %%mm6, %%mm1\n\t"
  370. "pand %%mm6, %%mm4\n\t"
  371. "psrlq $8, %%mm2\n\t"
  372. "psrlq $8, %%mm5\n\t"
  373. "pand %%mm7, %%mm2\n\t"
  374. "pand %%mm7, %%mm5\n\t"
  375. "por %%mm1, %%mm0\n\t"
  376. "por %%mm4, %%mm3\n\t"
  377. "por %%mm2, %%mm0\n\t"
  378. "por %%mm5, %%mm3\n\t"
  379. "psllq $16, %%mm3\n\t"
  380. "por %%mm3, %%mm0\n\t"
  381. MOVNTQ" %%mm0, %0\n\t"
  382. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  383. d += 4;
  384. s += 16;
  385. }
  386. #endif
  387. __asm __volatile(SFENCE:::"memory");
  388. __asm __volatile(EMMS:::"memory");
  389. #endif
  390. while(s < end)
  391. {
  392. // FIXME on bigendian
  393. const int src= *s; s += 4;
  394. *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
  395. // *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
  396. }
  397. }
  398. static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
  399. {
  400. const uint8_t *s = src;
  401. const uint8_t *end;
  402. #ifdef HAVE_MMX
  403. const uint8_t *mm_end;
  404. #endif
  405. uint16_t *d = (uint16_t *)dst;
  406. end = s + src_size;
  407. #ifdef HAVE_MMX
  408. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  409. __asm __volatile(
  410. "movq %0, %%mm7\n\t"
  411. "movq %1, %%mm6\n\t"
  412. ::"m"(red_16mask),"m"(green_16mask));
  413. mm_end = end - 15;
  414. while(s < mm_end)
  415. {
  416. __asm __volatile(
  417. PREFETCH" 32%1\n\t"
  418. "movd %1, %%mm0\n\t"
  419. "movd 4%1, %%mm3\n\t"
  420. "punpckldq 8%1, %%mm0\n\t"
  421. "punpckldq 12%1, %%mm3\n\t"
  422. "movq %%mm0, %%mm1\n\t"
  423. "movq %%mm0, %%mm2\n\t"
  424. "movq %%mm3, %%mm4\n\t"
  425. "movq %%mm3, %%mm5\n\t"
  426. "psllq $8, %%mm0\n\t"
  427. "psllq $8, %%mm3\n\t"
  428. "pand %%mm7, %%mm0\n\t"
  429. "pand %%mm7, %%mm3\n\t"
  430. "psrlq $5, %%mm1\n\t"
  431. "psrlq $5, %%mm4\n\t"
  432. "pand %%mm6, %%mm1\n\t"
  433. "pand %%mm6, %%mm4\n\t"
  434. "psrlq $19, %%mm2\n\t"
  435. "psrlq $19, %%mm5\n\t"
  436. "pand %2, %%mm2\n\t"
  437. "pand %2, %%mm5\n\t"
  438. "por %%mm1, %%mm0\n\t"
  439. "por %%mm4, %%mm3\n\t"
  440. "por %%mm2, %%mm0\n\t"
  441. "por %%mm5, %%mm3\n\t"
  442. "psllq $16, %%mm3\n\t"
  443. "por %%mm3, %%mm0\n\t"
  444. MOVNTQ" %%mm0, %0\n\t"
  445. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  446. d += 4;
  447. s += 16;
  448. }
  449. __asm __volatile(SFENCE:::"memory");
  450. __asm __volatile(EMMS:::"memory");
  451. #endif
  452. while(s < end)
  453. {
  454. // FIXME on bigendian
  455. const int src= *s; s += 4;
  456. *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
  457. }
  458. }
  459. static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
  460. {
  461. const uint8_t *s = src;
  462. const uint8_t *end;
  463. #ifdef HAVE_MMX
  464. const uint8_t *mm_end;
  465. #endif
  466. uint16_t *d = (uint16_t *)dst;
  467. end = s + src_size;
  468. #ifdef HAVE_MMX
  469. mm_end = end - 15;
  470. #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
  471. asm volatile(
  472. "movq %3, %%mm5 \n\t"
  473. "movq %4, %%mm6 \n\t"
  474. "movq %5, %%mm7 \n\t"
  475. ".balign 16 \n\t"
  476. "1: \n\t"
  477. PREFETCH" 32(%1) \n\t"
  478. "movd (%1), %%mm0 \n\t"
  479. "movd 4(%1), %%mm3 \n\t"
  480. "punpckldq 8(%1), %%mm0 \n\t"
  481. "punpckldq 12(%1), %%mm3 \n\t"
  482. "movq %%mm0, %%mm1 \n\t"
  483. "movq %%mm3, %%mm4 \n\t"
  484. "pand %%mm6, %%mm0 \n\t"
  485. "pand %%mm6, %%mm3 \n\t"
  486. "pmaddwd %%mm7, %%mm0 \n\t"
  487. "pmaddwd %%mm7, %%mm3 \n\t"
  488. "pand %%mm5, %%mm1 \n\t"
  489. "pand %%mm5, %%mm4 \n\t"
  490. "por %%mm1, %%mm0 \n\t"
  491. "por %%mm4, %%mm3 \n\t"
  492. "psrld $6, %%mm0 \n\t"
  493. "pslld $10, %%mm3 \n\t"
  494. "por %%mm3, %%mm0 \n\t"
  495. MOVNTQ" %%mm0, (%0) \n\t"
  496. "add $16, %1 \n\t"
  497. "add $8, %0 \n\t"
  498. "cmp %2, %1 \n\t"
  499. " jb 1b \n\t"
  500. : "+r" (d), "+r"(s)
  501. : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
  502. );
  503. #else
  504. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  505. __asm __volatile(
  506. "movq %0, %%mm7\n\t"
  507. "movq %1, %%mm6\n\t"
  508. ::"m"(red_15mask),"m"(green_15mask));
  509. while(s < mm_end)
  510. {
  511. __asm __volatile(
  512. PREFETCH" 32%1\n\t"
  513. "movd %1, %%mm0\n\t"
  514. "movd 4%1, %%mm3\n\t"
  515. "punpckldq 8%1, %%mm0\n\t"
  516. "punpckldq 12%1, %%mm3\n\t"
  517. "movq %%mm0, %%mm1\n\t"
  518. "movq %%mm0, %%mm2\n\t"
  519. "movq %%mm3, %%mm4\n\t"
  520. "movq %%mm3, %%mm5\n\t"
  521. "psrlq $3, %%mm0\n\t"
  522. "psrlq $3, %%mm3\n\t"
  523. "pand %2, %%mm0\n\t"
  524. "pand %2, %%mm3\n\t"
  525. "psrlq $6, %%mm1\n\t"
  526. "psrlq $6, %%mm4\n\t"
  527. "pand %%mm6, %%mm1\n\t"
  528. "pand %%mm6, %%mm4\n\t"
  529. "psrlq $9, %%mm2\n\t"
  530. "psrlq $9, %%mm5\n\t"
  531. "pand %%mm7, %%mm2\n\t"
  532. "pand %%mm7, %%mm5\n\t"
  533. "por %%mm1, %%mm0\n\t"
  534. "por %%mm4, %%mm3\n\t"
  535. "por %%mm2, %%mm0\n\t"
  536. "por %%mm5, %%mm3\n\t"
  537. "psllq $16, %%mm3\n\t"
  538. "por %%mm3, %%mm0\n\t"
  539. MOVNTQ" %%mm0, %0\n\t"
  540. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  541. d += 4;
  542. s += 16;
  543. }
  544. #endif
  545. __asm __volatile(SFENCE:::"memory");
  546. __asm __volatile(EMMS:::"memory");
  547. #endif
  548. while(s < end)
  549. {
  550. // FIXME on bigendian
  551. const int src= *s; s += 4;
  552. *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
  553. }
  554. }
  555. static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
  556. {
  557. const uint8_t *s = src;
  558. const uint8_t *end;
  559. #ifdef HAVE_MMX
  560. const uint8_t *mm_end;
  561. #endif
  562. uint16_t *d = (uint16_t *)dst;
  563. end = s + src_size;
  564. #ifdef HAVE_MMX
  565. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  566. __asm __volatile(
  567. "movq %0, %%mm7\n\t"
  568. "movq %1, %%mm6\n\t"
  569. ::"m"(red_15mask),"m"(green_15mask));
  570. mm_end = end - 15;
  571. while(s < mm_end)
  572. {
  573. __asm __volatile(
  574. PREFETCH" 32%1\n\t"
  575. "movd %1, %%mm0\n\t"
  576. "movd 4%1, %%mm3\n\t"
  577. "punpckldq 8%1, %%mm0\n\t"
  578. "punpckldq 12%1, %%mm3\n\t"
  579. "movq %%mm0, %%mm1\n\t"
  580. "movq %%mm0, %%mm2\n\t"
  581. "movq %%mm3, %%mm4\n\t"
  582. "movq %%mm3, %%mm5\n\t"
  583. "psllq $7, %%mm0\n\t"
  584. "psllq $7, %%mm3\n\t"
  585. "pand %%mm7, %%mm0\n\t"
  586. "pand %%mm7, %%mm3\n\t"
  587. "psrlq $6, %%mm1\n\t"
  588. "psrlq $6, %%mm4\n\t"
  589. "pand %%mm6, %%mm1\n\t"
  590. "pand %%mm6, %%mm4\n\t"
  591. "psrlq $19, %%mm2\n\t"
  592. "psrlq $19, %%mm5\n\t"
  593. "pand %2, %%mm2\n\t"
  594. "pand %2, %%mm5\n\t"
  595. "por %%mm1, %%mm0\n\t"
  596. "por %%mm4, %%mm3\n\t"
  597. "por %%mm2, %%mm0\n\t"
  598. "por %%mm5, %%mm3\n\t"
  599. "psllq $16, %%mm3\n\t"
  600. "por %%mm3, %%mm0\n\t"
  601. MOVNTQ" %%mm0, %0\n\t"
  602. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  603. d += 4;
  604. s += 16;
  605. }
  606. __asm __volatile(SFENCE:::"memory");
  607. __asm __volatile(EMMS:::"memory");
  608. #endif
  609. while(s < end)
  610. {
  611. // FIXME on bigendian
  612. const int src= *s; s += 4;
  613. *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
  614. }
  615. }
  616. static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
  617. {
  618. const uint8_t *s = src;
  619. const uint8_t *end;
  620. #ifdef HAVE_MMX
  621. const uint8_t *mm_end;
  622. #endif
  623. uint16_t *d = (uint16_t *)dst;
  624. end = s + src_size;
  625. #ifdef HAVE_MMX
  626. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  627. __asm __volatile(
  628. "movq %0, %%mm7\n\t"
  629. "movq %1, %%mm6\n\t"
  630. ::"m"(red_16mask),"m"(green_16mask));
  631. mm_end = end - 11;
  632. while(s < mm_end)
  633. {
  634. __asm __volatile(
  635. PREFETCH" 32%1\n\t"
  636. "movd %1, %%mm0\n\t"
  637. "movd 3%1, %%mm3\n\t"
  638. "punpckldq 6%1, %%mm0\n\t"
  639. "punpckldq 9%1, %%mm3\n\t"
  640. "movq %%mm0, %%mm1\n\t"
  641. "movq %%mm0, %%mm2\n\t"
  642. "movq %%mm3, %%mm4\n\t"
  643. "movq %%mm3, %%mm5\n\t"
  644. "psrlq $3, %%mm0\n\t"
  645. "psrlq $3, %%mm3\n\t"
  646. "pand %2, %%mm0\n\t"
  647. "pand %2, %%mm3\n\t"
  648. "psrlq $5, %%mm1\n\t"
  649. "psrlq $5, %%mm4\n\t"
  650. "pand %%mm6, %%mm1\n\t"
  651. "pand %%mm6, %%mm4\n\t"
  652. "psrlq $8, %%mm2\n\t"
  653. "psrlq $8, %%mm5\n\t"
  654. "pand %%mm7, %%mm2\n\t"
  655. "pand %%mm7, %%mm5\n\t"
  656. "por %%mm1, %%mm0\n\t"
  657. "por %%mm4, %%mm3\n\t"
  658. "por %%mm2, %%mm0\n\t"
  659. "por %%mm5, %%mm3\n\t"
  660. "psllq $16, %%mm3\n\t"
  661. "por %%mm3, %%mm0\n\t"
  662. MOVNTQ" %%mm0, %0\n\t"
  663. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  664. d += 4;
  665. s += 12;
  666. }
  667. __asm __volatile(SFENCE:::"memory");
  668. __asm __volatile(EMMS:::"memory");
  669. #endif
  670. while(s < end)
  671. {
  672. const int b= *s++;
  673. const int g= *s++;
  674. const int r= *s++;
  675. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  676. }
  677. }
  678. static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
  679. {
  680. const uint8_t *s = src;
  681. const uint8_t *end;
  682. #ifdef HAVE_MMX
  683. const uint8_t *mm_end;
  684. #endif
  685. uint16_t *d = (uint16_t *)dst;
  686. end = s + src_size;
  687. #ifdef HAVE_MMX
  688. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  689. __asm __volatile(
  690. "movq %0, %%mm7\n\t"
  691. "movq %1, %%mm6\n\t"
  692. ::"m"(red_16mask),"m"(green_16mask));
  693. mm_end = end - 15;
  694. while(s < mm_end)
  695. {
  696. __asm __volatile(
  697. PREFETCH" 32%1\n\t"
  698. "movd %1, %%mm0\n\t"
  699. "movd 3%1, %%mm3\n\t"
  700. "punpckldq 6%1, %%mm0\n\t"
  701. "punpckldq 9%1, %%mm3\n\t"
  702. "movq %%mm0, %%mm1\n\t"
  703. "movq %%mm0, %%mm2\n\t"
  704. "movq %%mm3, %%mm4\n\t"
  705. "movq %%mm3, %%mm5\n\t"
  706. "psllq $8, %%mm0\n\t"
  707. "psllq $8, %%mm3\n\t"
  708. "pand %%mm7, %%mm0\n\t"
  709. "pand %%mm7, %%mm3\n\t"
  710. "psrlq $5, %%mm1\n\t"
  711. "psrlq $5, %%mm4\n\t"
  712. "pand %%mm6, %%mm1\n\t"
  713. "pand %%mm6, %%mm4\n\t"
  714. "psrlq $19, %%mm2\n\t"
  715. "psrlq $19, %%mm5\n\t"
  716. "pand %2, %%mm2\n\t"
  717. "pand %2, %%mm5\n\t"
  718. "por %%mm1, %%mm0\n\t"
  719. "por %%mm4, %%mm3\n\t"
  720. "por %%mm2, %%mm0\n\t"
  721. "por %%mm5, %%mm3\n\t"
  722. "psllq $16, %%mm3\n\t"
  723. "por %%mm3, %%mm0\n\t"
  724. MOVNTQ" %%mm0, %0\n\t"
  725. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  726. d += 4;
  727. s += 12;
  728. }
  729. __asm __volatile(SFENCE:::"memory");
  730. __asm __volatile(EMMS:::"memory");
  731. #endif
  732. while(s < end)
  733. {
  734. const int r= *s++;
  735. const int g= *s++;
  736. const int b= *s++;
  737. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  738. }
  739. }
  740. static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
  741. {
  742. const uint8_t *s = src;
  743. const uint8_t *end;
  744. #ifdef HAVE_MMX
  745. const uint8_t *mm_end;
  746. #endif
  747. uint16_t *d = (uint16_t *)dst;
  748. end = s + src_size;
  749. #ifdef HAVE_MMX
  750. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  751. __asm __volatile(
  752. "movq %0, %%mm7\n\t"
  753. "movq %1, %%mm6\n\t"
  754. ::"m"(red_15mask),"m"(green_15mask));
  755. mm_end = end - 11;
  756. while(s < mm_end)
  757. {
  758. __asm __volatile(
  759. PREFETCH" 32%1\n\t"
  760. "movd %1, %%mm0\n\t"
  761. "movd 3%1, %%mm3\n\t"
  762. "punpckldq 6%1, %%mm0\n\t"
  763. "punpckldq 9%1, %%mm3\n\t"
  764. "movq %%mm0, %%mm1\n\t"
  765. "movq %%mm0, %%mm2\n\t"
  766. "movq %%mm3, %%mm4\n\t"
  767. "movq %%mm3, %%mm5\n\t"
  768. "psrlq $3, %%mm0\n\t"
  769. "psrlq $3, %%mm3\n\t"
  770. "pand %2, %%mm0\n\t"
  771. "pand %2, %%mm3\n\t"
  772. "psrlq $6, %%mm1\n\t"
  773. "psrlq $6, %%mm4\n\t"
  774. "pand %%mm6, %%mm1\n\t"
  775. "pand %%mm6, %%mm4\n\t"
  776. "psrlq $9, %%mm2\n\t"
  777. "psrlq $9, %%mm5\n\t"
  778. "pand %%mm7, %%mm2\n\t"
  779. "pand %%mm7, %%mm5\n\t"
  780. "por %%mm1, %%mm0\n\t"
  781. "por %%mm4, %%mm3\n\t"
  782. "por %%mm2, %%mm0\n\t"
  783. "por %%mm5, %%mm3\n\t"
  784. "psllq $16, %%mm3\n\t"
  785. "por %%mm3, %%mm0\n\t"
  786. MOVNTQ" %%mm0, %0\n\t"
  787. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  788. d += 4;
  789. s += 12;
  790. }
  791. __asm __volatile(SFENCE:::"memory");
  792. __asm __volatile(EMMS:::"memory");
  793. #endif
  794. while(s < end)
  795. {
  796. const int b= *s++;
  797. const int g= *s++;
  798. const int r= *s++;
  799. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  800. }
  801. }
  802. static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
  803. {
  804. const uint8_t *s = src;
  805. const uint8_t *end;
  806. #ifdef HAVE_MMX
  807. const uint8_t *mm_end;
  808. #endif
  809. uint16_t *d = (uint16_t *)dst;
  810. end = s + src_size;
  811. #ifdef HAVE_MMX
  812. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  813. __asm __volatile(
  814. "movq %0, %%mm7\n\t"
  815. "movq %1, %%mm6\n\t"
  816. ::"m"(red_15mask),"m"(green_15mask));
  817. mm_end = end - 15;
  818. while(s < mm_end)
  819. {
  820. __asm __volatile(
  821. PREFETCH" 32%1\n\t"
  822. "movd %1, %%mm0\n\t"
  823. "movd 3%1, %%mm3\n\t"
  824. "punpckldq 6%1, %%mm0\n\t"
  825. "punpckldq 9%1, %%mm3\n\t"
  826. "movq %%mm0, %%mm1\n\t"
  827. "movq %%mm0, %%mm2\n\t"
  828. "movq %%mm3, %%mm4\n\t"
  829. "movq %%mm3, %%mm5\n\t"
  830. "psllq $7, %%mm0\n\t"
  831. "psllq $7, %%mm3\n\t"
  832. "pand %%mm7, %%mm0\n\t"
  833. "pand %%mm7, %%mm3\n\t"
  834. "psrlq $6, %%mm1\n\t"
  835. "psrlq $6, %%mm4\n\t"
  836. "pand %%mm6, %%mm1\n\t"
  837. "pand %%mm6, %%mm4\n\t"
  838. "psrlq $19, %%mm2\n\t"
  839. "psrlq $19, %%mm5\n\t"
  840. "pand %2, %%mm2\n\t"
  841. "pand %2, %%mm5\n\t"
  842. "por %%mm1, %%mm0\n\t"
  843. "por %%mm4, %%mm3\n\t"
  844. "por %%mm2, %%mm0\n\t"
  845. "por %%mm5, %%mm3\n\t"
  846. "psllq $16, %%mm3\n\t"
  847. "por %%mm3, %%mm0\n\t"
  848. MOVNTQ" %%mm0, %0\n\t"
  849. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  850. d += 4;
  851. s += 12;
  852. }
  853. __asm __volatile(SFENCE:::"memory");
  854. __asm __volatile(EMMS:::"memory");
  855. #endif
  856. while(s < end)
  857. {
  858. const int r= *s++;
  859. const int g= *s++;
  860. const int b= *s++;
  861. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  862. }
  863. }
/*
 * Here we use a less accurate approximation: we simply left-shift the
 * input value and fill the low-order bits with zeroes.  This improves
 * PNG compression but cannot reproduce white exactly, since it never
 * generates an all-ones maximum value; the net effect is to darken the
 * image slightly.
 *
 * The more accurate method would be "left bit replication":
 *
 *   4 3 2 1 0
 *   ---------
 *   1 1 0 1 1
 *
 *   7 6 5 4 3  2 1 0
 *   ----------------
 *   1 1 0 1 1  1 1 0
 *   |=======|  |===|
 *       |      leftmost bits repeated to fill open bits
 *       |
 *   original bits
 */
  884. static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
  885. {
  886. const uint16_t *end;
  887. #ifdef HAVE_MMX
  888. const uint16_t *mm_end;
  889. #endif
  890. uint8_t *d = (uint8_t *)dst;
  891. const uint16_t *s = (uint16_t *)src;
  892. end = s + src_size/2;
  893. #ifdef HAVE_MMX
  894. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  895. mm_end = end - 7;
  896. while(s < mm_end)
  897. {
  898. __asm __volatile(
  899. PREFETCH" 32%1\n\t"
  900. "movq %1, %%mm0\n\t"
  901. "movq %1, %%mm1\n\t"
  902. "movq %1, %%mm2\n\t"
  903. "pand %2, %%mm0\n\t"
  904. "pand %3, %%mm1\n\t"
  905. "pand %4, %%mm2\n\t"
  906. "psllq $3, %%mm0\n\t"
  907. "psrlq $2, %%mm1\n\t"
  908. "psrlq $7, %%mm2\n\t"
  909. "movq %%mm0, %%mm3\n\t"
  910. "movq %%mm1, %%mm4\n\t"
  911. "movq %%mm2, %%mm5\n\t"
  912. "punpcklwd %5, %%mm0\n\t"
  913. "punpcklwd %5, %%mm1\n\t"
  914. "punpcklwd %5, %%mm2\n\t"
  915. "punpckhwd %5, %%mm3\n\t"
  916. "punpckhwd %5, %%mm4\n\t"
  917. "punpckhwd %5, %%mm5\n\t"
  918. "psllq $8, %%mm1\n\t"
  919. "psllq $16, %%mm2\n\t"
  920. "por %%mm1, %%mm0\n\t"
  921. "por %%mm2, %%mm0\n\t"
  922. "psllq $8, %%mm4\n\t"
  923. "psllq $16, %%mm5\n\t"
  924. "por %%mm4, %%mm3\n\t"
  925. "por %%mm5, %%mm3\n\t"
  926. "movq %%mm0, %%mm6\n\t"
  927. "movq %%mm3, %%mm7\n\t"
  928. "movq 8%1, %%mm0\n\t"
  929. "movq 8%1, %%mm1\n\t"
  930. "movq 8%1, %%mm2\n\t"
  931. "pand %2, %%mm0\n\t"
  932. "pand %3, %%mm1\n\t"
  933. "pand %4, %%mm2\n\t"
  934. "psllq $3, %%mm0\n\t"
  935. "psrlq $2, %%mm1\n\t"
  936. "psrlq $7, %%mm2\n\t"
  937. "movq %%mm0, %%mm3\n\t"
  938. "movq %%mm1, %%mm4\n\t"
  939. "movq %%mm2, %%mm5\n\t"
  940. "punpcklwd %5, %%mm0\n\t"
  941. "punpcklwd %5, %%mm1\n\t"
  942. "punpcklwd %5, %%mm2\n\t"
  943. "punpckhwd %5, %%mm3\n\t"
  944. "punpckhwd %5, %%mm4\n\t"
  945. "punpckhwd %5, %%mm5\n\t"
  946. "psllq $8, %%mm1\n\t"
  947. "psllq $16, %%mm2\n\t"
  948. "por %%mm1, %%mm0\n\t"
  949. "por %%mm2, %%mm0\n\t"
  950. "psllq $8, %%mm4\n\t"
  951. "psllq $16, %%mm5\n\t"
  952. "por %%mm4, %%mm3\n\t"
  953. "por %%mm5, %%mm3\n\t"
  954. :"=m"(*d)
  955. :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
  956. :"memory");
  957. /* Borrowed 32 to 24 */
  958. __asm __volatile(
  959. "movq %%mm0, %%mm4\n\t"
  960. "movq %%mm3, %%mm5\n\t"
  961. "movq %%mm6, %%mm0\n\t"
  962. "movq %%mm7, %%mm1\n\t"
  963. "movq %%mm4, %%mm6\n\t"
  964. "movq %%mm5, %%mm7\n\t"
  965. "movq %%mm0, %%mm2\n\t"
  966. "movq %%mm1, %%mm3\n\t"
  967. "psrlq $8, %%mm2\n\t"
  968. "psrlq $8, %%mm3\n\t"
  969. "psrlq $8, %%mm6\n\t"
  970. "psrlq $8, %%mm7\n\t"
  971. "pand %2, %%mm0\n\t"
  972. "pand %2, %%mm1\n\t"
  973. "pand %2, %%mm4\n\t"
  974. "pand %2, %%mm5\n\t"
  975. "pand %3, %%mm2\n\t"
  976. "pand %3, %%mm3\n\t"
  977. "pand %3, %%mm6\n\t"
  978. "pand %3, %%mm7\n\t"
  979. "por %%mm2, %%mm0\n\t"
  980. "por %%mm3, %%mm1\n\t"
  981. "por %%mm6, %%mm4\n\t"
  982. "por %%mm7, %%mm5\n\t"
  983. "movq %%mm1, %%mm2\n\t"
  984. "movq %%mm4, %%mm3\n\t"
  985. "psllq $48, %%mm2\n\t"
  986. "psllq $32, %%mm3\n\t"
  987. "pand %4, %%mm2\n\t"
  988. "pand %5, %%mm3\n\t"
  989. "por %%mm2, %%mm0\n\t"
  990. "psrlq $16, %%mm1\n\t"
  991. "psrlq $32, %%mm4\n\t"
  992. "psllq $16, %%mm5\n\t"
  993. "por %%mm3, %%mm1\n\t"
  994. "pand %6, %%mm5\n\t"
  995. "por %%mm5, %%mm4\n\t"
  996. MOVNTQ" %%mm0, %0\n\t"
  997. MOVNTQ" %%mm1, 8%0\n\t"
  998. MOVNTQ" %%mm4, 16%0"
  999. :"=m"(*d)
  1000. :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
  1001. :"memory");
  1002. d += 24;
  1003. s += 8;
  1004. }
  1005. __asm __volatile(SFENCE:::"memory");
  1006. __asm __volatile(EMMS:::"memory");
  1007. #endif
  1008. while(s < end)
  1009. {
  1010. register uint16_t bgr;
  1011. bgr = *s++;
  1012. *d++ = (bgr&0x1F)<<3;
  1013. *d++ = (bgr&0x3E0)>>2;
  1014. *d++ = (bgr&0x7C00)>>7;
  1015. }
  1016. }
/*
 * Convert RGB565 (16 bit per pixel, 5-6-5 fields) to packed 24 bpp.
 * src:      source buffer of 16-bit pixels
 * dst:      destination buffer, 3 bytes per pixel
 * src_size: size of src in BYTES (so src_size/2 pixels are converted)
 * The MMX path expands and repacks 8 pixels (16 input bytes -> 24 output
 * bytes) per iteration; the trailing scalar loop converts any remainder
 * (and is the entire implementation when HAVE_MMX is not defined).
 */
  1017. static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
  1018. {
  1019. const uint16_t *end;
  1020. #ifdef HAVE_MMX
  1021. const uint16_t *mm_end;
  1022. #endif
  1023. uint8_t *d = (uint8_t *)dst;
  1024. const uint16_t *s = (const uint16_t *)src;
  1025. end = s + src_size/2;
  1026. #ifdef HAVE_MMX
  1027. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  1028. mm_end = end - 7; /* stop while at least 8 pixels remain for the MMX body */
  1029. while(s < mm_end)
  1030. {
/* Stage 1: isolate B/G/R with mask16b/g/r, shift each field to its 8-bit
 * position, then unpack words to dwords so each pixel occupies 32 bits.
 * Results for the first 4 pixels end up in mm6/mm7, the next 4 in mm0/mm3. */
  1031. __asm __volatile(
  1032. PREFETCH" 32%1\n\t"
  1033. "movq %1, %%mm0\n\t"
  1034. "movq %1, %%mm1\n\t"
  1035. "movq %1, %%mm2\n\t"
  1036. "pand %2, %%mm0\n\t"
  1037. "pand %3, %%mm1\n\t"
  1038. "pand %4, %%mm2\n\t"
  1039. "psllq $3, %%mm0\n\t"
  1040. "psrlq $3, %%mm1\n\t"
  1041. "psrlq $8, %%mm2\n\t"
  1042. "movq %%mm0, %%mm3\n\t"
  1043. "movq %%mm1, %%mm4\n\t"
  1044. "movq %%mm2, %%mm5\n\t"
  1045. "punpcklwd %5, %%mm0\n\t"
  1046. "punpcklwd %5, %%mm1\n\t"
  1047. "punpcklwd %5, %%mm2\n\t"
  1048. "punpckhwd %5, %%mm3\n\t"
  1049. "punpckhwd %5, %%mm4\n\t"
  1050. "punpckhwd %5, %%mm5\n\t"
  1051. "psllq $8, %%mm1\n\t"
  1052. "psllq $16, %%mm2\n\t"
  1053. "por %%mm1, %%mm0\n\t"
  1054. "por %%mm2, %%mm0\n\t"
  1055. "psllq $8, %%mm4\n\t"
  1056. "psllq $16, %%mm5\n\t"
  1057. "por %%mm4, %%mm3\n\t"
  1058. "por %%mm5, %%mm3\n\t"
  1059. "movq %%mm0, %%mm6\n\t"
  1060. "movq %%mm3, %%mm7\n\t"
  1061. "movq 8%1, %%mm0\n\t"
  1062. "movq 8%1, %%mm1\n\t"
  1063. "movq 8%1, %%mm2\n\t"
  1064. "pand %2, %%mm0\n\t"
  1065. "pand %3, %%mm1\n\t"
  1066. "pand %4, %%mm2\n\t"
  1067. "psllq $3, %%mm0\n\t"
  1068. "psrlq $3, %%mm1\n\t"
  1069. "psrlq $8, %%mm2\n\t"
  1070. "movq %%mm0, %%mm3\n\t"
  1071. "movq %%mm1, %%mm4\n\t"
  1072. "movq %%mm2, %%mm5\n\t"
  1073. "punpcklwd %5, %%mm0\n\t"
  1074. "punpcklwd %5, %%mm1\n\t"
  1075. "punpcklwd %5, %%mm2\n\t"
  1076. "punpckhwd %5, %%mm3\n\t"
  1077. "punpckhwd %5, %%mm4\n\t"
  1078. "punpckhwd %5, %%mm5\n\t"
  1079. "psllq $8, %%mm1\n\t"
  1080. "psllq $16, %%mm2\n\t"
  1081. "por %%mm1, %%mm0\n\t"
  1082. "por %%mm2, %%mm0\n\t"
  1083. "psllq $8, %%mm4\n\t"
  1084. "psllq $16, %%mm5\n\t"
  1085. "por %%mm4, %%mm3\n\t"
  1086. "por %%mm5, %%mm3\n\t"
  1087. :"=m"(*d)
  1088. :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
  1089. :"memory");
  1090. /* Borrowed 32 to 24 */
/* Stage 2: drop the pad byte from the 8 expanded 32-bit pixels
 * (held in mm6/mm7/mm0/mm3) and store 24 packed output bytes. */
  1091. __asm __volatile(
  1092. "movq %%mm0, %%mm4\n\t"
  1093. "movq %%mm3, %%mm5\n\t"
  1094. "movq %%mm6, %%mm0\n\t"
  1095. "movq %%mm7, %%mm1\n\t"
  1096. "movq %%mm4, %%mm6\n\t"
  1097. "movq %%mm5, %%mm7\n\t"
  1098. "movq %%mm0, %%mm2\n\t"
  1099. "movq %%mm1, %%mm3\n\t"
  1100. "psrlq $8, %%mm2\n\t"
  1101. "psrlq $8, %%mm3\n\t"
  1102. "psrlq $8, %%mm6\n\t"
  1103. "psrlq $8, %%mm7\n\t"
  1104. "pand %2, %%mm0\n\t"
  1105. "pand %2, %%mm1\n\t"
  1106. "pand %2, %%mm4\n\t"
  1107. "pand %2, %%mm5\n\t"
  1108. "pand %3, %%mm2\n\t"
  1109. "pand %3, %%mm3\n\t"
  1110. "pand %3, %%mm6\n\t"
  1111. "pand %3, %%mm7\n\t"
  1112. "por %%mm2, %%mm0\n\t"
  1113. "por %%mm3, %%mm1\n\t"
  1114. "por %%mm6, %%mm4\n\t"
  1115. "por %%mm7, %%mm5\n\t"
  1116. "movq %%mm1, %%mm2\n\t"
  1117. "movq %%mm4, %%mm3\n\t"
  1118. "psllq $48, %%mm2\n\t"
  1119. "psllq $32, %%mm3\n\t"
  1120. "pand %4, %%mm2\n\t"
  1121. "pand %5, %%mm3\n\t"
  1122. "por %%mm2, %%mm0\n\t"
  1123. "psrlq $16, %%mm1\n\t"
  1124. "psrlq $32, %%mm4\n\t"
  1125. "psllq $16, %%mm5\n\t"
  1126. "por %%mm3, %%mm1\n\t"
  1127. "pand %6, %%mm5\n\t"
  1128. "por %%mm5, %%mm4\n\t"
  1129. MOVNTQ" %%mm0, %0\n\t"
  1130. MOVNTQ" %%mm1, 8%0\n\t"
  1131. MOVNTQ" %%mm4, 16%0"
  1132. :"=m"(*d)
  1133. :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
  1134. :"memory");
  1135. d += 24;
  1136. s += 8;
  1137. }
  1138. __asm __volatile(SFENCE:::"memory");
  1139. __asm __volatile(EMMS:::"memory");
  1140. #endif
/* Scalar tail: shift each 5/6/5 field up to 8 bits (low bits stay zero). */
  1141. while(s < end)
  1142. {
  1143. register uint16_t bgr;
  1144. bgr = *s++;
  1145. *d++ = (bgr&0x1F)<<3;
  1146. *d++ = (bgr&0x7E0)>>3;
  1147. *d++ = (bgr&0xF800)>>8;
  1148. }
  1149. }
/*
 * Convert RGB555 (15 bit in a 16-bit word, 5-5-5 fields) to 32 bpp.
 * src:      source buffer of 16-bit pixels
 * dst:      destination buffer, 4 bytes per pixel (4th byte written as 0
 *           in the scalar path)
 * src_size: size of src in BYTES
 * The MMX path converts 4 pixels per iteration; the scalar loop handles
 * the remainder (see the FIXME below regarding big-endian correctness).
 */
  1150. static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
  1151. {
  1152. const uint16_t *end;
  1153. #ifdef HAVE_MMX
  1154. const uint16_t *mm_end;
  1155. #endif
  1156. uint8_t *d = (uint8_t *)dst;
  1157. const uint16_t *s = (const uint16_t *)src;
  1158. end = s + src_size/2;
  1159. #ifdef HAVE_MMX
  1160. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  1161. __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); /* mm7 = 0, used as unpack filler */
  1162. mm_end = end - 3; /* stop while at least 4 pixels remain */
  1163. while(s < mm_end)
  1164. {
/* Mask out B/G/R with mask15b/g/r, shift each to its 8-bit slot, then
 * widen words to dwords against zero and OR the planes back together. */
  1165. __asm __volatile(
  1166. PREFETCH" 32%1\n\t"
  1167. "movq %1, %%mm0\n\t"
  1168. "movq %1, %%mm1\n\t"
  1169. "movq %1, %%mm2\n\t"
  1170. "pand %2, %%mm0\n\t"
  1171. "pand %3, %%mm1\n\t"
  1172. "pand %4, %%mm2\n\t"
  1173. "psllq $3, %%mm0\n\t"
  1174. "psrlq $2, %%mm1\n\t"
  1175. "psrlq $7, %%mm2\n\t"
  1176. "movq %%mm0, %%mm3\n\t"
  1177. "movq %%mm1, %%mm4\n\t"
  1178. "movq %%mm2, %%mm5\n\t"
  1179. "punpcklwd %%mm7, %%mm0\n\t"
  1180. "punpcklwd %%mm7, %%mm1\n\t"
  1181. "punpcklwd %%mm7, %%mm2\n\t"
  1182. "punpckhwd %%mm7, %%mm3\n\t"
  1183. "punpckhwd %%mm7, %%mm4\n\t"
  1184. "punpckhwd %%mm7, %%mm5\n\t"
  1185. "psllq $8, %%mm1\n\t"
  1186. "psllq $16, %%mm2\n\t"
  1187. "por %%mm1, %%mm0\n\t"
  1188. "por %%mm2, %%mm0\n\t"
  1189. "psllq $8, %%mm4\n\t"
  1190. "psllq $16, %%mm5\n\t"
  1191. "por %%mm4, %%mm3\n\t"
  1192. "por %%mm5, %%mm3\n\t"
  1193. MOVNTQ" %%mm0, %0\n\t"
  1194. MOVNTQ" %%mm3, 8%0\n\t"
  1195. :"=m"(*d)
  1196. :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
  1197. :"memory");
  1198. d += 16;
  1199. s += 4;
  1200. }
  1201. __asm __volatile(SFENCE:::"memory");
  1202. __asm __volatile(EMMS:::"memory");
  1203. #endif
/* Scalar tail: write B, G, R expanded to 8 bits plus a zero pad byte. */
  1204. while(s < end)
  1205. {
  1206. #if 0 //slightly slower on athlon
  1207. int bgr= *s++;
  1208. *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
  1209. #else
  1210. //FIXME this is very likely wrong for bigendian (and the following converters too)
  1211. register uint16_t bgr;
  1212. bgr = *s++;
  1213. #ifdef WORDS_BIGENDIAN
  1214. *d++ = 0;
  1215. *d++ = (bgr&0x1F)<<3;
  1216. *d++ = (bgr&0x3E0)>>2;
  1217. *d++ = (bgr&0x7C00)>>7;
  1218. #else
  1219. *d++ = (bgr&0x1F)<<3;
  1220. *d++ = (bgr&0x3E0)>>2;
  1221. *d++ = (bgr&0x7C00)>>7;
  1222. *d++ = 0;
  1223. #endif
  1224. #endif
  1225. }
  1226. }
/*
 * Convert RGB565 (16 bit, 5-6-5 fields) to 32 bpp.
 * src:      source buffer of 16-bit pixels
 * dst:      destination buffer, 4 bytes per pixel (pad byte written as 0
 *           in the scalar path)
 * src_size: size of src in BYTES
 * Same structure as rgb15to32 above, with 565 masks/shifts; the MMX path
 * converts 4 pixels per iteration, the scalar loop finishes the rest.
 */
  1227. static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
  1228. {
  1229. const uint16_t *end;
  1230. #ifdef HAVE_MMX
  1231. const uint16_t *mm_end;
  1232. #endif
  1233. uint8_t *d = (uint8_t *)dst;
  1234. const uint16_t *s = (uint16_t *)src;
  1235. end = s + src_size/2;
  1236. #ifdef HAVE_MMX
  1237. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  1238. __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); /* mm7 = 0, unpack filler */
  1239. mm_end = end - 3; /* stop while at least 4 pixels remain */
  1240. while(s < mm_end)
  1241. {
  1242. __asm __volatile(
  1243. PREFETCH" 32%1\n\t"
  1244. "movq %1, %%mm0\n\t"
  1245. "movq %1, %%mm1\n\t"
  1246. "movq %1, %%mm2\n\t"
  1247. "pand %2, %%mm0\n\t"
  1248. "pand %3, %%mm1\n\t"
  1249. "pand %4, %%mm2\n\t"
  1250. "psllq $3, %%mm0\n\t"
  1251. "psrlq $3, %%mm1\n\t"
  1252. "psrlq $8, %%mm2\n\t"
  1253. "movq %%mm0, %%mm3\n\t"
  1254. "movq %%mm1, %%mm4\n\t"
  1255. "movq %%mm2, %%mm5\n\t"
  1256. "punpcklwd %%mm7, %%mm0\n\t"
  1257. "punpcklwd %%mm7, %%mm1\n\t"
  1258. "punpcklwd %%mm7, %%mm2\n\t"
  1259. "punpckhwd %%mm7, %%mm3\n\t"
  1260. "punpckhwd %%mm7, %%mm4\n\t"
  1261. "punpckhwd %%mm7, %%mm5\n\t"
  1262. "psllq $8, %%mm1\n\t"
  1263. "psllq $16, %%mm2\n\t"
  1264. "por %%mm1, %%mm0\n\t"
  1265. "por %%mm2, %%mm0\n\t"
  1266. "psllq $8, %%mm4\n\t"
  1267. "psllq $16, %%mm5\n\t"
  1268. "por %%mm4, %%mm3\n\t"
  1269. "por %%mm5, %%mm3\n\t"
  1270. MOVNTQ" %%mm0, %0\n\t"
  1271. MOVNTQ" %%mm3, 8%0\n\t"
  1272. :"=m"(*d)
  1273. :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
  1274. :"memory");
  1275. d += 16;
  1276. s += 4;
  1277. }
  1278. __asm __volatile(SFENCE:::"memory");
  1279. __asm __volatile(EMMS:::"memory");
  1280. #endif
/* Scalar tail: expand 5/6/5 fields to 8 bits, pad byte = 0. */
  1281. while(s < end)
  1282. {
  1283. register uint16_t bgr;
  1284. bgr = *s++;
  1285. #ifdef WORDS_BIGENDIAN
  1286. *d++ = 0;
  1287. *d++ = (bgr&0x1F)<<3;
  1288. *d++ = (bgr&0x7E0)>>3;
  1289. *d++ = (bgr&0xF800)>>8;
  1290. #else
  1291. *d++ = (bgr&0x1F)<<3;
  1292. *d++ = (bgr&0x7E0)>>3;
  1293. *d++ = (bgr&0xF800)>>8;
  1294. *d++ = 0;
  1295. #endif
  1296. }
  1297. }
/*
 * Swap the R and B channels of 32 bpp pixels in place order (RGBA <-> BGRA).
 * src:      source buffer of 4-byte pixels
 * dst:      destination buffer, same size as src
 * src_size: size in BYTES (src_size/4 pixels)
 * NOTE(review): the C fallback copies only the three color bytes; the
 * fourth (alpha/pad) byte of dst is never written and keeps whatever was
 * there before — confirm callers pre-clear or ignore it.
 */
  1298. static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
  1299. {
  1300. #ifdef HAVE_MMX
  1301. /* TODO: unroll this loop */
  1302. asm volatile (
  1303. "xor %%"REG_a", %%"REG_a" \n\t"
  1304. ".balign 16 \n\t"
  1305. "1: \n\t"
  1306. PREFETCH" 32(%0, %%"REG_a") \n\t"
  1307. "movq (%0, %%"REG_a"), %%mm0 \n\t"
  1308. "movq %%mm0, %%mm1 \n\t"
  1309. "movq %%mm0, %%mm2 \n\t"
  1310. "pslld $16, %%mm0 \n\t"
  1311. "psrld $16, %%mm1 \n\t"
  1312. "pand "MANGLE(mask32r)", %%mm0 \n\t"
  1313. "pand "MANGLE(mask32g)", %%mm2 \n\t"
  1314. "pand "MANGLE(mask32b)", %%mm1 \n\t"
  1315. "por %%mm0, %%mm2 \n\t"
  1316. "por %%mm1, %%mm2 \n\t"
  1317. MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
  1318. "add $8, %%"REG_a" \n\t"
  1319. "cmp %2, %%"REG_a" \n\t"
  1320. " jb 1b \n\t"
  1321. :: "r" (src), "r"(dst), "r" ((long)src_size-7)
  1322. : "%"REG_a
  1323. );
  1324. __asm __volatile(SFENCE:::"memory");
  1325. __asm __volatile(EMMS:::"memory");
  1326. #else
/* C fallback: swap bytes 0 and 2 of each pixel (1 and 3 on big-endian). */
  1327. unsigned i;
  1328. unsigned num_pixels = src_size >> 2;
  1329. for(i=0; i<num_pixels; i++)
  1330. {
  1331. #ifdef WORDS_BIGENDIAN
  1332. dst[4*i + 1] = src[4*i + 3];
  1333. dst[4*i + 2] = src[4*i + 2];
  1334. dst[4*i + 3] = src[4*i + 1];
  1335. #else
  1336. dst[4*i + 0] = src[4*i + 2];
  1337. dst[4*i + 1] = src[4*i + 1];
  1338. dst[4*i + 2] = src[4*i + 0];
  1339. #endif
  1340. }
  1341. #endif
  1342. }
/*
 * Swap R and B in packed 24 bpp data (RGB24 <-> BGR24).
 * src:      source buffer, 3 bytes per pixel
 * dst:      destination buffer, same size
 * src_size: size in BYTES (should be a multiple of 3)
 * The MMX path runs the loop counter from -(src_size-23) up to 0 so a
 * single register serves as both index and termination test, handling
 * 8 pixels (24 bytes) per iteration; the leftover tail (and the whole
 * job without HAVE_MMX) is done by the byte-swapping C loop.
 */
  1343. static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
  1344. {
  1345. unsigned i;
  1346. #ifdef HAVE_MMX
  1347. long mmx_size= 23 - src_size; /* negative loop counter; reaches >=0 at end */
  1348. asm volatile (
  1349. "movq "MANGLE(mask24r)", %%mm5 \n\t"
  1350. "movq "MANGLE(mask24g)", %%mm6 \n\t"
  1351. "movq "MANGLE(mask24b)", %%mm7 \n\t"
  1352. ".balign 16 \n\t"
  1353. "1: \n\t"
  1354. PREFETCH" 32(%1, %%"REG_a") \n\t"
  1355. "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
  1356. "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
  1357. "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
  1358. "psllq $16, %%mm0 \n\t" // 00 BGR BGR
  1359. "pand %%mm5, %%mm0 \n\t"
  1360. "pand %%mm6, %%mm1 \n\t"
  1361. "pand %%mm7, %%mm2 \n\t"
  1362. "por %%mm0, %%mm1 \n\t"
  1363. "por %%mm2, %%mm1 \n\t"
  1364. "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
  1365. MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
  1366. "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
  1367. "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
  1368. "pand %%mm7, %%mm0 \n\t"
  1369. "pand %%mm5, %%mm1 \n\t"
  1370. "pand %%mm6, %%mm2 \n\t"
  1371. "por %%mm0, %%mm1 \n\t"
  1372. "por %%mm2, %%mm1 \n\t"
  1373. "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
  1374. MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
  1375. "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
  1376. "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
  1377. "pand %%mm6, %%mm0 \n\t"
  1378. "pand %%mm7, %%mm1 \n\t"
  1379. "pand %%mm5, %%mm2 \n\t"
  1380. "por %%mm0, %%mm1 \n\t"
  1381. "por %%mm2, %%mm1 \n\t"
  1382. MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
  1383. "add $24, %%"REG_a" \n\t"
  1384. " js 1b \n\t"
  1385. : "+a" (mmx_size)
  1386. : "r" (src-mmx_size), "r"(dst-mmx_size)
  1387. );
  1388. __asm __volatile(SFENCE:::"memory");
  1389. __asm __volatile(EMMS:::"memory");
  1390. if(mmx_size==23) return; //finished, was multiple of 8 pixels
/* Rewind src/dst/src_size so the C loop below converts only the tail. */
  1391. src+= src_size;
  1392. dst+= src_size;
  1393. src_size= 23-mmx_size;
  1394. src-= src_size;
  1395. dst-= src_size;
  1396. #endif
  1397. for(i=0; i<src_size; i+=3)
  1398. {
  1399. register uint8_t x;
  1400. x = src[i + 2];
  1401. dst[i + 1] = src[i + 1];
  1402. dst[i + 2] = src[i + 0];
  1403. dst[i + 0] = x;
  1404. }
  1405. }
/*
 * Interleave planar YUV into packed YUY2 (Y0 U Y1 V ...).
 * ysrc/usrc/vsrc:     luma and chroma source planes
 * dst:                packed output, 2 bytes per luma sample
 * width/height:       luma dimensions; width is halved for chroma
 * lumStride/chromStride/srcStride: line strides in bytes
 * vertLumPerChroma:   number of luma lines sharing one chroma line
 *                     (2 for YV12 input, 1 for YUV422P input)
 * One output line per loop iteration; chroma pointers only advance every
 * vertLumPerChroma lines. Paths: MMX asm, Alpha MVI asm, generic 64-bit
 * word packing, and a portable 32-bit C fallback.
 */
  1406. static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1407. unsigned int width, unsigned int height,
  1408. int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
  1409. {
  1410. unsigned y;
  1411. const unsigned chromWidth= width>>1;
  1412. for(y=0; y<height; y++)
  1413. {
  1414. #ifdef HAVE_MMX
  1415. //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
  1416. asm volatile(
  1417. "xor %%"REG_a", %%"REG_a" \n\t"
  1418. ".balign 16 \n\t"
  1419. "1: \n\t"
  1420. PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
  1421. PREFETCH" 32(%2, %%"REG_a") \n\t"
  1422. PREFETCH" 32(%3, %%"REG_a") \n\t"
  1423. "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
  1424. "movq %%mm0, %%mm2 \n\t" // U(0)
  1425. "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
  1426. "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1427. "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
  1428. "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
  1429. "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
  1430. "movq %%mm3, %%mm4 \n\t" // Y(0)
  1431. "movq %%mm5, %%mm6 \n\t" // Y(8)
  1432. "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
  1433. "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
  1434. "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
  1435. "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
  1436. MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
  1437. MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
  1438. MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
  1439. MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
  1440. "add $8, %%"REG_a" \n\t"
  1441. "cmp %4, %%"REG_a" \n\t"
  1442. " jb 1b \n\t"
  1443. ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth)
  1444. : "%"REG_a
  1445. );
  1446. #else
  1447. #if defined ARCH_ALPHA && defined HAVE_MVI
/* Alpha MVI path: packs two output lines per iteration using unpkbw/unpkbl;
 * note it advances y/ysrc/dst once extra below to account for that. */
  1448. #define pl2yuy2(n) \
  1449. y1 = yc[n]; \
  1450. y2 = yc2[n]; \
  1451. u = uc[n]; \
  1452. v = vc[n]; \
  1453. asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
  1454. asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
  1455. asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
  1456. asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
  1457. yuv1 = (u << 8) + (v << 24); \
  1458. yuv2 = yuv1 + y2; \
  1459. yuv1 += y1; \
  1460. qdst[n] = yuv1; \
  1461. qdst2[n] = yuv2;
  1462. int i;
  1463. uint64_t *qdst = (uint64_t *) dst;
  1464. uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
  1465. const uint32_t *yc = (uint32_t *) ysrc;
  1466. const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
  1467. const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
  1468. for(i = 0; i < chromWidth; i += 8){
  1469. uint64_t y1, y2, yuv1, yuv2;
  1470. uint64_t u, v;
  1471. /* Prefetch */
  1472. asm("ldq $31,64(%0)" :: "r"(yc));
  1473. asm("ldq $31,64(%0)" :: "r"(yc2));
  1474. asm("ldq $31,64(%0)" :: "r"(uc));
  1475. asm("ldq $31,64(%0)" :: "r"(vc));
  1476. pl2yuy2(0);
  1477. pl2yuy2(1);
  1478. pl2yuy2(2);
  1479. pl2yuy2(3);
  1480. yc += 4;
  1481. yc2 += 4;
  1482. uc += 4;
  1483. vc += 4;
  1484. qdst += 4;
  1485. qdst2 += 4;
  1486. }
  1487. y++;
  1488. ysrc += lumStride;
  1489. dst += dstStride;
  1490. #elif __WORDSIZE >= 64
/* 64-bit generic path: assemble two YUYV dwords and store one qword. */
  1491. int i;
  1492. uint64_t *ldst = (uint64_t *) dst;
  1493. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1494. for(i = 0; i < chromWidth; i += 2){
  1495. uint64_t k, l;
  1496. k = yc[0] + (uc[0] << 8) +
  1497. (yc[1] << 16) + (vc[0] << 24);
  1498. l = yc[2] + (uc[1] << 8) +
  1499. (yc[3] << 16) + (vc[1] << 24);
  1500. *ldst++ = k + (l << 32);
  1501. yc += 4;
  1502. uc += 2;
  1503. vc += 2;
  1504. }
  1505. #else
/* Portable path: one 32-bit store per Y pair, byte order per endianness. */
  1506. int i, *idst = (int32_t *) dst;
  1507. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1508. for(i = 0; i < chromWidth; i++){
  1509. #ifdef WORDS_BIGENDIAN
  1510. *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
  1511. (yc[1] << 8) + (vc[0] << 0);
  1512. #else
  1513. *idst++ = yc[0] + (uc[0] << 8) +
  1514. (yc[1] << 16) + (vc[0] << 24);
  1515. #endif
  1516. yc += 2;
  1517. uc++;
  1518. vc++;
  1519. }
  1520. #endif
  1521. #endif
/* Advance chroma only once every vertLumPerChroma luma lines
 * (vertLumPerChroma must be a power of two for this mask test). */
  1522. if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
  1523. {
  1524. usrc += chromStride;
  1525. vsrc += chromStride;
  1526. }
  1527. ysrc += lumStride;
  1528. dst += dstStride;
  1529. }
  1530. #ifdef HAVE_MMX
  1531. asm( EMMS" \n\t"
  1532. SFENCE" \n\t"
  1533. :::"memory");
  1534. #endif
  1535. }
  1536. /**
  1537. * Convert planar YV12 to packed YUY2.
  1538. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1539. * problem for anyone then tell me, and I'll fix it)
  1540. */
  1541. static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1542. unsigned int width, unsigned int height,
  1543. int lumStride, int chromStride, int dstStride)
  1544. {
  1545. //FIXME interpolate chroma
/* vertLumPerChroma = 2: YV12 chroma is subsampled vertically by 2. */
  1546. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1547. }
/*
 * Interleave planar YUV into packed UYVY (U Y0 V Y1 ...).
 * Parameters as in yuvPlanartoyuy2; only the byte order of the packed
 * output differs (chroma first). Paths: MMX asm, generic 64-bit word
 * packing, portable 32-bit C fallback.
 * NOTE(review): the asm line comments below ("Y(0)", "YUYV ...") were
 * copied from the YUY2 variant; the registers actually hold UV data and
 * UYVY output here.
 */
  1548. static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1549. unsigned int width, unsigned int height,
  1550. int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
  1551. {
  1552. unsigned y;
  1553. const unsigned chromWidth= width>>1;
  1554. for(y=0; y<height; y++)
  1555. {
  1556. #ifdef HAVE_MMX
  1557. //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
  1558. asm volatile(
  1559. "xor %%"REG_a", %%"REG_a" \n\t"
  1560. ".balign 16 \n\t"
  1561. "1: \n\t"
  1562. PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
  1563. PREFETCH" 32(%2, %%"REG_a") \n\t"
  1564. PREFETCH" 32(%3, %%"REG_a") \n\t"
  1565. "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
  1566. "movq %%mm0, %%mm2 \n\t" // U(0)
  1567. "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
  1568. "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1569. "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
  1570. "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
  1571. "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
  1572. "movq %%mm0, %%mm4 \n\t" // Y(0)
  1573. "movq %%mm2, %%mm6 \n\t" // Y(8)
  1574. "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
  1575. "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
  1576. "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
  1577. "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
  1578. MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
  1579. MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
  1580. MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
  1581. MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
  1582. "add $8, %%"REG_a" \n\t"
  1583. "cmp %4, %%"REG_a" \n\t"
  1584. " jb 1b \n\t"
  1585. ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth)
  1586. : "%"REG_a
  1587. );
  1588. #else
  1589. //FIXME adapt the alpha asm code from yv12->yuy2
  1590. #if __WORDSIZE >= 64
/* 64-bit generic path: assemble two UYVY dwords and store one qword. */
  1591. int i;
  1592. uint64_t *ldst = (uint64_t *) dst;
  1593. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1594. for(i = 0; i < chromWidth; i += 2){
  1595. uint64_t k, l;
  1596. k = uc[0] + (yc[0] << 8) +
  1597. (vc[0] << 16) + (yc[1] << 24);
  1598. l = uc[1] + (yc[2] << 8) +
  1599. (vc[1] << 16) + (yc[3] << 24);
  1600. *ldst++ = k + (l << 32);
  1601. yc += 4;
  1602. uc += 2;
  1603. vc += 2;
  1604. }
  1605. #else
/* Portable path: one 32-bit store per Y pair, byte order per endianness. */
  1606. int i, *idst = (int32_t *) dst;
  1607. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1608. for(i = 0; i < chromWidth; i++){
  1609. #ifdef WORDS_BIGENDIAN
  1610. *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
  1611. (vc[0] << 8) + (yc[1] << 0);
  1612. #else
  1613. *idst++ = uc[0] + (yc[0] << 8) +
  1614. (vc[0] << 16) + (yc[1] << 24);
  1615. #endif
  1616. yc += 2;
  1617. uc++;
  1618. vc++;
  1619. }
  1620. #endif
  1621. #endif
/* Advance chroma only once every vertLumPerChroma luma lines. */
  1622. if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
  1623. {
  1624. usrc += chromStride;
  1625. vsrc += chromStride;
  1626. }
  1627. ysrc += lumStride;
  1628. dst += dstStride;
  1629. }
  1630. #ifdef HAVE_MMX
  1631. asm( EMMS" \n\t"
  1632. SFENCE" \n\t"
  1633. :::"memory");
  1634. #endif
  1635. }
  1636. /**
  1637. * Convert planar YV12 to packed UYVY.
  1638. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1639. * problem for anyone then tell me, and I'll fix it)
  1640. */
  1641. static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1642. unsigned int width, unsigned int height,
  1643. int lumStride, int chromStride, int dstStride)
  1644. {
  1645. //FIXME interpolate chroma
/* vertLumPerChroma = 2: YV12 chroma is subsampled vertically by 2. */
  1646. RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1647. }
  1648. /**
  1649. * Convert planar YUV422P to packed YUY2.
  1650. * width should be a multiple of 16
  1651. */
  1652. static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1653. unsigned int width, unsigned int height,
  1654. int lumStride, int chromStride, int dstStride)
  1655. {
/* vertLumPerChroma = 1: 4:2:2 input has one chroma line per luma line. */
  1656. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1657. }
  1658. /**
  1659. * Convert packed YUY2 to planar YV12.
  1660. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1661. * problem for anyone then tell me, and I'll fix it)
  1662. */
  1663. static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1664. unsigned int width, unsigned int height,
  1665. int lumStride, int chromStride, int srcStride)
  1666. {
  1667. unsigned y;
  1668. const unsigned chromWidth= width>>1;
/* Process two source lines per iteration: the first yields Y + U + V,
 * the second yields Y only (chroma of odd lines is discarded). */
  1669. for(y=0; y<height; y+=2)
  1670. {
  1671. #ifdef HAVE_MMX
  1672. asm volatile(
  1673. "xor %%"REG_a", %%"REG_a" \n\t"
  1674. "pcmpeqw %%mm7, %%mm7 \n\t"
  1675. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1676. ".balign 16 \n\t"
  1677. "1: \n\t"
  1678. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1679. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1680. "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
  1681. "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
  1682. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
  1683. "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
  1684. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
  1685. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1686. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1687. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1688. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1689. MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
  1690. "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
  1691. "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
  1692. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
  1693. "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
  1694. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
  1695. "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
  1696. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1697. "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1698. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1699. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1700. MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
  1701. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1702. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1703. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1704. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1705. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1706. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1707. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1708. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1709. MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
  1710. MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
  1711. "add $8, %%"REG_a" \n\t"
  1712. "cmp %4, %%"REG_a" \n\t"
  1713. " jb 1b \n\t"
  1714. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth)
  1715. : "memory", "%"REG_a
  1716. );
  1717. ydst += lumStride;
  1718. src += srcStride;
/* Second line of the pair: extract luma only (mm7 mask survives from the
 * previous asm block above). */
  1719. asm volatile(
  1720. "xor %%"REG_a", %%"REG_a" \n\t"
  1721. ".balign 16 \n\t"
  1722. "1: \n\t"
  1723. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1724. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1725. "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
  1726. "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
  1727. "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
  1728. "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1729. "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1730. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1731. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1732. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1733. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1734. MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
  1735. MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
  1736. "add $8, %%"REG_a" \n\t"
  1737. "cmp %4, %%"REG_a" \n\t"
  1738. " jb 1b \n\t"
  1739. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth)
  1740. : "memory", "%"REG_a
  1741. );
  1742. #else
  1743. unsigned i;
  1744. for(i=0; i<chromWidth; i++)
  1745. {
  1746. ydst[2*i+0] = src[4*i+0];
  1747. udst[i] = src[4*i+1];
  1748. ydst[2*i+1] = src[4*i+2];
  1749. vdst[i] = src[4*i+3];
  1750. }
  1751. ydst += lumStride;
  1752. src += srcStride;
  1753. for(i=0; i<chromWidth; i++)
  1754. {
  1755. ydst[2*i+0] = src[4*i+0];
  1756. ydst[2*i+1] = src[4*i+2];
  1757. }
  1758. #endif
  1759. udst += chromStride;
  1760. vdst += chromStride;
  1761. ydst += lumStride;
  1762. src += srcStride;
  1763. }
  1764. #ifdef HAVE_MMX
  1765. asm volatile( EMMS" \n\t"
  1766. SFENCE" \n\t"
  1767. :::"memory");
  1768. #endif
  1769. }
/*
 * Convert YVU9 to YV12 — currently only copies the luma plane.
 * The U/V planes are NOT written (see the XXX below), and the usrc, vsrc,
 * udst, vdst and chromStride parameters are unused.
 * NOTE(review): the memcpy ignores lumStride and copies width*height
 * contiguous bytes — assumes a packed Y plane (lumStride == width);
 * verify against callers.
 */
  1770. static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
  1771. uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1772. unsigned int width, unsigned int height, int lumStride, int chromStride)
  1773. {
  1774. /* Y Plane */
  1775. memcpy(ydst, ysrc, width*height);
  1776. /* XXX: implement upscaling for U,V */
  1777. }
/*
 * Upscale one plane by 2x in both directions using bilinear-style 3:1
 * weighting ((3*a+b)/4) between neighbouring samples.
 * src/dst:             single-plane buffers
 * srcWidth/srcHeight:  source dimensions; dst is 2*srcWidth x 2*srcHeight
 * srcStride/dstStride: line strides in bytes
 * First and last output lines are horizontally-only interpolated copies
 * of the first/last source lines; interior line pairs are produced by the
 * MMX2/3DNow loop (16 source pixels per iteration) or the C loop.
 */
  1778. static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
  1779. {
  1780. int x,y;
  1781. dst[0]= src[0];
  1782. // first line
  1783. for(x=0; x<srcWidth-1; x++){
  1784. dst[2*x+1]= (3*src[x] + src[x+1])>>2;
  1785. dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
  1786. }
  1787. dst[2*srcWidth-1]= src[srcWidth-1];
  1788. dst+= dstStride;
  1789. for(y=1; y<srcHeight; y++){
  1790. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  1791. const long mmxSize= srcWidth&~15;
/* Negative index runs from -mmxSize to 0; each repeated PAVGB pair below
 * approximates the (3*a+b+2)>>2 weighting of the C code with averaging
 * rounding — results may differ from the C path in the low bit. */
  1792. asm volatile(
  1793. "mov %4, %%"REG_a" \n\t"
  1794. "1: \n\t"
  1795. "movq (%0, %%"REG_a"), %%mm0 \n\t"
  1796. "movq (%1, %%"REG_a"), %%mm1 \n\t"
  1797. "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
  1798. "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
  1799. "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
  1800. "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
  1801. PAVGB" %%mm0, %%mm5 \n\t"
  1802. PAVGB" %%mm0, %%mm3 \n\t"
  1803. PAVGB" %%mm0, %%mm5 \n\t"
  1804. PAVGB" %%mm0, %%mm3 \n\t"
  1805. PAVGB" %%mm1, %%mm4 \n\t"
  1806. PAVGB" %%mm1, %%mm2 \n\t"
  1807. PAVGB" %%mm1, %%mm4 \n\t"
  1808. PAVGB" %%mm1, %%mm2 \n\t"
  1809. "movq %%mm5, %%mm7 \n\t"
  1810. "movq %%mm4, %%mm6 \n\t"
  1811. "punpcklbw %%mm3, %%mm5 \n\t"
  1812. "punpckhbw %%mm3, %%mm7 \n\t"
  1813. "punpcklbw %%mm2, %%mm4 \n\t"
  1814. "punpckhbw %%mm2, %%mm6 \n\t"
  1815. #if 1
  1816. MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
  1817. MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
  1818. MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
  1819. MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
  1820. #else
  1821. "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
  1822. "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
  1823. "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
  1824. "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
  1825. #endif
  1826. "add $8, %%"REG_a" \n\t"
  1827. " js 1b \n\t"
  1828. :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
  1829. "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
  1830. "g" (-mmxSize)
  1831. : "%"REG_a
  1832. );
  1833. #else
  1834. const int mmxSize=1; /* no SIMD: C loop below starts from x=0 */
  1835. #endif
/* Left edge of the two output lines, then the C tail/remainder. */
  1836. dst[0 ]= (3*src[0] + src[srcStride])>>2;
  1837. dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
  1838. for(x=mmxSize-1; x<srcWidth-1; x++){
  1839. dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
  1840. dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
  1841. dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
  1842. dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
  1843. }
  1844. dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
  1845. dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
  1846. dst+=dstStride*2;
  1847. src+=srcStride;
  1848. }
  1849. // last line
  1850. #if 1
  1851. dst[0]= src[0];
  1852. for(x=0; x<srcWidth-1; x++){
  1853. dst[2*x+1]= (3*src[x] + src[x+1])>>2;
  1854. dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
  1855. }
  1856. dst[2*srcWidth-1]= src[srcWidth-1];
  1857. #else
  1858. for(x=0; x<srcWidth; x++){
  1859. dst[2*x+0]=
  1860. dst[2*x+1]= src[x];
  1861. }
  1862. #endif
  1863. #ifdef HAVE_MMX
  1864. asm volatile( EMMS" \n\t"
  1865. SFENCE" \n\t"
  1866. :::"memory");
  1867. #endif
  1868. }
  1869. /**
  1870. * Convert packed UYVY to planar YV12.
  1871. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1872. * problem for anyone then tell me, and I'll fix it)
  1873. * chrominance data is only taken from every second line, others are ignored FIXME write HQ version
  1874. */
/* NOTE(review): unlike the other converters this asm uses 32-bit %%eax
 * rather than REG_a — presumably fine on x86-32 only; verify for x86-64
 * builds. Comments marked "YUYV" in the second loop are copy-paste
 * residue; the data is UYVY (Y in the high byte of each word). */
  1875. static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1876. unsigned int width, unsigned int height,
  1877. int lumStride, int chromStride, int srcStride)
  1878. {
  1879. unsigned y;
  1880. const unsigned chromWidth= width>>1;
/* Two source lines per iteration: first yields Y+U+V, second Y only. */
  1881. for(y=0; y<height; y+=2)
  1882. {
  1883. #ifdef HAVE_MMX
  1884. asm volatile(
  1885. "xorl %%eax, %%eax \n\t"
  1886. "pcmpeqw %%mm7, %%mm7 \n\t"
  1887. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1888. ".balign 16 \n\t"
  1889. "1: \n\t"
  1890. PREFETCH" 64(%0, %%eax, 4) \n\t"
  1891. "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
  1892. "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
  1893. "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
  1894. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
  1895. "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
  1896. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
  1897. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1898. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1899. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1900. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1901. MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
  1902. "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
  1903. "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
  1904. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
  1905. "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
  1906. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
  1907. "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
  1908. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1909. "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1910. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1911. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1912. MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
  1913. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1914. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1915. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1916. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1917. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1918. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1919. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1920. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1921. MOVNTQ" %%mm0, (%3, %%eax) \n\t"
  1922. MOVNTQ" %%mm2, (%2, %%eax) \n\t"
  1923. "addl $8, %%eax \n\t"
  1924. "cmpl %4, %%eax \n\t"
  1925. " jb 1b \n\t"
  1926. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1927. : "memory", "%eax"
  1928. );
  1929. ydst += lumStride;
  1930. src += srcStride;
/* Second line of the pair: extract luma (high byte of each word) only. */
  1931. asm volatile(
  1932. "xorl %%eax, %%eax \n\t"
  1933. ".balign 16 \n\t"
  1934. "1: \n\t"
  1935. PREFETCH" 64(%0, %%eax, 4) \n\t"
  1936. "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
  1937. "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
  1938. "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
  1939. "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
  1940. "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1941. "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1942. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1943. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1944. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1945. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1946. MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
  1947. MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
  1948. "addl $8, %%eax \n\t"
  1949. "cmpl %4, %%eax \n\t"
  1950. " jb 1b \n\t"
  1951. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1952. : "memory", "%eax"
  1953. );
  1954. #else
  1955. unsigned i;
  1956. for(i=0; i<chromWidth; i++)
  1957. {
  1958. udst[i] = src[4*i+0];
  1959. ydst[2*i+0] = src[4*i+1];
  1960. vdst[i] = src[4*i+2];
  1961. ydst[2*i+1] = src[4*i+3];
  1962. }
  1963. ydst += lumStride;
  1964. src += srcStride;
  1965. for(i=0; i<chromWidth; i++)
  1966. {
  1967. ydst[2*i+0] = src[4*i+1];
  1968. ydst[2*i+1] = src[4*i+3];
  1969. }
  1970. #endif
  1971. udst += chromStride;
  1972. vdst += chromStride;
  1973. ydst += lumStride;
  1974. src += srcStride;
  1975. }
  1976. #ifdef HAVE_MMX
  1977. asm volatile( EMMS" \n\t"
  1978. SFENCE" \n\t"
  1979. :::"memory");
  1980. #endif
  1981. }
  1982. /**
  1983. *
  1984. * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
  1985. * problem for anyone then tell me, and I'll fix it)
  1986. * chrominance data is only taken from every second line, others are ignored in the C version FIXME write HQ version
  1987. */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                       unsigned int width, unsigned int height,
                                       int lumStride, int chromStride, int srcStride)
{
    /* Converts packed 24-bit RGB to planar YV12 (Y + 2x2-subsampled U/V).
       NOTE: despite the "rgb" name, the scalar reference below reads the
       bytes as B,G,R in memory (src[6*i+0] is b), and the asm multiplies
       with the bgr2* coefficient tables — the input is BGR byte order. */
    unsigned y;
    const unsigned chromWidth= width>>1;   /* U/V planes are half width */
#ifdef HAVE_MMX
    /* MMX path: handles all rows except the last two (height-2); the final
       pair of rows falls through to the C loop below, which continues at
       the current value of y. */
    for(y=0; y<height-2; y+=2)
    {
        unsigned i;
        /* Luma pass: two consecutive source rows, 8 Y values per iteration.
           REG_a counts bytes of Y from -width up to 0 ("js 1b" loops while
           negative); REG_b = 3*REG_a indexes the 3-byte-per-pixel source.
           NOTE(review): widths that are not a multiple of 8 make the last
           store overshoot ydst by up to 7 bytes — callers presumably obey
           the documented width restrictions; confirm. */
        for(i=0; i<2; i++)
        {
            asm volatile(
                "mov %2, %%"REG_a" \n\t"
                "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"   /* BGR->Y dot-product coefficients */
                "movq "MANGLE(w1111)", %%mm5 \n\t"        /* words of 1 for horizontal add via pmaddwd */
                "pxor %%mm7, %%mm7 \n\t"                  /* zero, for byte->word unpack */
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
                ".balign 16 \n\t"
                "1: \n\t"
                PREFETCH" 64(%0, %%"REG_b") \n\t"
                /* first 4 pixels: load, widen to words, multiply-accumulate */
                "movd (%0, %%"REG_b"), %%mm0 \n\t"
                "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
                "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm0 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
                /* full-precision variant: drop 8 fractional bits before packing */
                "psrad $8, %%mm0 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
#endif
                "packssdw %%mm1, %%mm0 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm0 \n\t"   /* horizontal add of coefficient pairs */
                "pmaddwd %%mm5, %%mm2 \n\t"
                "packssdw %%mm2, %%mm0 \n\t"
                "psraw $7, %%mm0 \n\t"
                /* next 4 pixels, same scheme */
                "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
                "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
                "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm4 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
#endif
                "packssdw %%mm1, %%mm4 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm4 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "add $24, %%"REG_b" \n\t"     /* advance source by 8 pixels (24 bytes) */
                "packssdw %%mm2, %%mm4 \n\t"
                "psraw $7, %%mm4 \n\t"
                "packuswb %%mm4, %%mm0 \n\t"  /* 8 Y bytes */
                "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"   /* add luma offset (e.g. +16), saturating */
                MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
                "add $8, %%"REG_a" \n\t"
                " js 1b \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" ((long)-width)
                : "%"REG_a, "%"REG_b
            );
            ydst += lumStride;
            src += srcStride;
        }
        src -= srcStride*2;   /* rewind: chroma pass re-reads the same two rows */
        /* Chroma pass: averages 2x2 pixel blocks across the two rows (%0 and
           %1), then computes U and V for chromWidth samples. REG_a counts
           from -chromWidth to 0; REG_b = 6*REG_a (2 pixels * 3 bytes). */
        asm volatile(
            "mov %4, %%"REG_a" \n\t"
            "movq "MANGLE(w1111)", %%mm5 \n\t"
            "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
            "add %%"REG_b", %%"REG_b" \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_b") \n\t"
            PREFETCH" 64(%1, %%"REG_b") \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            /* fast path: byte-average vertically with PAVGB, then average
               horizontally-adjacent pixels via a 24-bit shift + PAVGB */
            "movq (%0, %%"REG_b"), %%mm0 \n\t"
            "movq (%1, %%"REG_b"), %%mm1 \n\t"
            "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
            "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm0 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            /* plain-MMX path: widen to words and sum the four pixels of the
               2x2 block, then divide by 4 with psrlw $2 */
            "movd (%0, %%"REG_b"), %%mm0 \n\t"
            "movd (%1, %%"REG_b"), %%mm1 \n\t"
            "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
            "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
            "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
            "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
            "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm4, %%mm2 \n\t"
            "psrlw $2, %%mm0 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            /* dot products: mm1/mm3 get the V coefficients, mm0/mm2 the U
               coefficients (mm6 holds bgr2UCoeff) */
            "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
            "pmaddwd %%mm0, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
#endif
            "packssdw %%mm2, %%mm0 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm0 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0 \n\t"
            /* second pair of 2x2 blocks (pixels 4..7 of the pair of rows) */
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
            "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
            "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
            "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm4, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm4 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
            "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
            "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
            "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm4 \n\t"
            "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
            "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
            "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
            "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm5 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm5, %%mm2 \n\t"
            "movq "MANGLE(w1111)", %%mm5 \n\t"   /* mm5 was clobbered above; reload */
            "psrlw $2, %%mm4 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
            "pmaddwd %%mm4, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm4 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm4 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
#endif
            "packssdw %%mm2, %%mm4 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm4 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "add $24, %%"REG_b" \n\t"
            "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4 \n\t"
            /* interleave both groups, bias, then split into U and V stores */
            "movq %%mm0, %%mm1 \n\t"
            "punpckldq %%mm4, %%mm0 \n\t"
            "punpckhdq %%mm4, %%mm1 \n\t"
            "packsswb %%mm1, %%mm0 \n\t"
            "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"   /* add chroma offset (e.g. +128) */
            "movd %%mm0, (%2, %%"REG_a") \n\t"           /* 4 U bytes */
            "punpckhdq %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%3, %%"REG_a") \n\t"           /* 4 V bytes */
            "add $4, %%"REG_a" \n\t"
            " js 1b \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" ((long)-chromWidth)
            : "%"REG_a, "%"REG_b
        );
        udst += chromStride;
        vdst += chromStride;
        src += srcStride*2;
    }
    /* leave MMX state / flush write-combining buffers */
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#else
    y=0;
#endif
    /* Scalar fallback (and tail rows for the MMX path). Note chroma is taken
       only from the even rows here, not averaged over both. */
    for(; y<height; y+=2)
    {
        unsigned i;
        for(i=0; i<chromWidth; i++)
        {
            /* even row: Y for both pixels, plus U/V from the left pixel only */
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];
            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
            unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
            udst[i] = U;
            vdst[i] = V;
            ydst[2*i] = Y;
            b= src[6*i+3];
            g= src[6*i+4];
            r= src[6*i+5];
            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        ydst += lumStride;
        src += srcStride;
        for(i=0; i<chromWidth; i++)
        {
            /* odd row: luma only, chroma of this row is ignored */
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];
            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i] = Y;
            b= src[6*i+3];
            g= src[6*i+4];
            r= src[6*i+5];
            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
}
  2265. void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
  2266. unsigned width, unsigned height, int src1Stride,
  2267. int src2Stride, int dstStride){
  2268. unsigned h;
  2269. for(h=0; h < height; h++)
  2270. {
  2271. unsigned w;
  2272. #ifdef HAVE_MMX
  2273. #ifdef HAVE_SSE2
  2274. asm(
  2275. "xor %%"REG_a", %%"REG_a" \n\t"
  2276. "1: \n\t"
  2277. PREFETCH" 64(%1, %%"REG_a") \n\t"
  2278. PREFETCH" 64(%2, %%"REG_a") \n\t"
  2279. "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
  2280. "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
  2281. "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
  2282. "punpcklbw %%xmm2, %%xmm0 \n\t"
  2283. "punpckhbw %%xmm2, %%xmm1 \n\t"
  2284. "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
  2285. "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
  2286. "add $16, %%"REG_a" \n\t"
  2287. "cmp %3, %%"REG_a" \n\t"
  2288. " jb 1b \n\t"
  2289. ::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15)
  2290. : "memory", "%"REG_a""
  2291. );
  2292. #else
  2293. asm(
  2294. "xor %%"REG_a", %%"REG_a" \n\t"
  2295. "1: \n\t"
  2296. PREFETCH" 64(%1, %%"REG_a") \n\t"
  2297. PREFETCH" 64(%2, %%"REG_a") \n\t"
  2298. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  2299. "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
  2300. "movq %%mm0, %%mm1 \n\t"
  2301. "movq %%mm2, %%mm3 \n\t"
  2302. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  2303. "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
  2304. "punpcklbw %%mm4, %%mm0 \n\t"
  2305. "punpckhbw %%mm4, %%mm1 \n\t"
  2306. "punpcklbw %%mm5, %%mm2 \n\t"
  2307. "punpckhbw %%mm5, %%mm3 \n\t"
  2308. MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
  2309. MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
  2310. MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
  2311. MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
  2312. "add $16, %%"REG_a" \n\t"
  2313. "cmp %3, %%"REG_a" \n\t"
  2314. " jb 1b \n\t"
  2315. ::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15)
  2316. : "memory", "%"REG_a
  2317. );
  2318. #endif
  2319. for(w= (width&(~15)); w < width; w++)
  2320. {
  2321. dest[2*w+0] = src1[w];
  2322. dest[2*w+1] = src2[w];
  2323. }
  2324. #else
  2325. for(w=0; w < width; w++)
  2326. {
  2327. dest[2*w+0] = src1[w];
  2328. dest[2*w+1] = src2[w];
  2329. }
  2330. #endif
  2331. dest += dstStride;
  2332. src1 += src1Stride;
  2333. src2 += src2Stride;
  2334. }
  2335. #ifdef HAVE_MMX
  2336. asm(
  2337. EMMS" \n\t"
  2338. SFENCE" \n\t"
  2339. ::: "memory"
  2340. );
  2341. #endif
  2342. }
  2343. static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
  2344. uint8_t *dst1, uint8_t *dst2,
  2345. unsigned width, unsigned height,
  2346. int srcStride1, int srcStride2,
  2347. int dstStride1, int dstStride2)
  2348. {
  2349. unsigned int y,x,h;
  2350. int w;
  2351. w=width/2; h=height/2;
  2352. #ifdef HAVE_MMX
  2353. asm volatile(
  2354. PREFETCH" %0\n\t"
  2355. PREFETCH" %1\n\t"
  2356. ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
  2357. #endif
  2358. for(y=0;y<h;y++){
  2359. const uint8_t* s1=src1+srcStride1*(y>>1);
  2360. uint8_t* d=dst1+dstStride1*y;
  2361. x=0;
  2362. #ifdef HAVE_MMX
  2363. for(;x<w-31;x+=32)
  2364. {
  2365. asm volatile(
  2366. PREFETCH" 32%1\n\t"
  2367. "movq %1, %%mm0\n\t"
  2368. "movq 8%1, %%mm2\n\t"
  2369. "movq 16%1, %%mm4\n\t"
  2370. "movq 24%1, %%mm6\n\t"
  2371. "movq %%mm0, %%mm1\n\t"
  2372. "movq %%mm2, %%mm3\n\t"
  2373. "movq %%mm4, %%mm5\n\t"
  2374. "movq %%mm6, %%mm7\n\t"
  2375. "punpcklbw %%mm0, %%mm0\n\t"
  2376. "punpckhbw %%mm1, %%mm1\n\t"
  2377. "punpcklbw %%mm2, %%mm2\n\t"
  2378. "punpckhbw %%mm3, %%mm3\n\t"
  2379. "punpcklbw %%mm4, %%mm4\n\t"
  2380. "punpckhbw %%mm5, %%mm5\n\t"
  2381. "punpcklbw %%mm6, %%mm6\n\t"
  2382. "punpckhbw %%mm7, %%mm7\n\t"
  2383. MOVNTQ" %%mm0, %0\n\t"
  2384. MOVNTQ" %%mm1, 8%0\n\t"
  2385. MOVNTQ" %%mm2, 16%0\n\t"
  2386. MOVNTQ" %%mm3, 24%0\n\t"
  2387. MOVNTQ" %%mm4, 32%0\n\t"
  2388. MOVNTQ" %%mm5, 40%0\n\t"
  2389. MOVNTQ" %%mm6, 48%0\n\t"
  2390. MOVNTQ" %%mm7, 56%0"
  2391. :"=m"(d[2*x])
  2392. :"m"(s1[x])
  2393. :"memory");
  2394. }
  2395. #endif
  2396. for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
  2397. }
  2398. for(y=0;y<h;y++){
  2399. const uint8_t* s2=src2+srcStride2*(y>>1);
  2400. uint8_t* d=dst2+dstStride2*y;
  2401. x=0;
  2402. #ifdef HAVE_MMX
  2403. for(;x<w-31;x+=32)
  2404. {
  2405. asm volatile(
  2406. PREFETCH" 32%1\n\t"
  2407. "movq %1, %%mm0\n\t"
  2408. "movq 8%1, %%mm2\n\t"
  2409. "movq 16%1, %%mm4\n\t"
  2410. "movq 24%1, %%mm6\n\t"
  2411. "movq %%mm0, %%mm1\n\t"
  2412. "movq %%mm2, %%mm3\n\t"
  2413. "movq %%mm4, %%mm5\n\t"
  2414. "movq %%mm6, %%mm7\n\t"
  2415. "punpcklbw %%mm0, %%mm0\n\t"
  2416. "punpckhbw %%mm1, %%mm1\n\t"
  2417. "punpcklbw %%mm2, %%mm2\n\t"
  2418. "punpckhbw %%mm3, %%mm3\n\t"
  2419. "punpcklbw %%mm4, %%mm4\n\t"
  2420. "punpckhbw %%mm5, %%mm5\n\t"
  2421. "punpcklbw %%mm6, %%mm6\n\t"
  2422. "punpckhbw %%mm7, %%mm7\n\t"
  2423. MOVNTQ" %%mm0, %0\n\t"
  2424. MOVNTQ" %%mm1, 8%0\n\t"
  2425. MOVNTQ" %%mm2, 16%0\n\t"
  2426. MOVNTQ" %%mm3, 24%0\n\t"
  2427. MOVNTQ" %%mm4, 32%0\n\t"
  2428. MOVNTQ" %%mm5, 40%0\n\t"
  2429. MOVNTQ" %%mm6, 48%0\n\t"
  2430. MOVNTQ" %%mm7, 56%0"
  2431. :"=m"(d[2*x])
  2432. :"m"(s2[x])
  2433. :"memory");
  2434. }
  2435. #endif
  2436. for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
  2437. }
  2438. #ifdef HAVE_MMX
  2439. asm(
  2440. EMMS" \n\t"
  2441. SFENCE" \n\t"
  2442. ::: "memory"
  2443. );
  2444. #endif
  2445. }
  2446. static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
  2447. uint8_t *dst,
  2448. unsigned width, unsigned height,
  2449. int srcStride1, int srcStride2,
  2450. int srcStride3, int dstStride)
  2451. {
  2452. unsigned long y,x,w,h;
  2453. w=width/2; h=height;
  2454. for(y=0;y<h;y++){
  2455. const uint8_t* yp=src1+srcStride1*y;
  2456. const uint8_t* up=src2+srcStride2*(y>>2);
  2457. const uint8_t* vp=src3+srcStride3*(y>>2);
  2458. uint8_t* d=dst+dstStride*y;
  2459. x=0;
  2460. #ifdef HAVE_MMX
  2461. for(;x<w-7;x+=8)
  2462. {
  2463. asm volatile(
  2464. PREFETCH" 32(%1, %0)\n\t"
  2465. PREFETCH" 32(%2, %0)\n\t"
  2466. PREFETCH" 32(%3, %0)\n\t"
  2467. "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
  2468. "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
  2469. "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
  2470. "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
  2471. "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
  2472. "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
  2473. "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
  2474. "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
  2475. "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
  2476. "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
  2477. "movq %%mm1, %%mm6\n\t"
  2478. "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
  2479. "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
  2480. "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
  2481. MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
  2482. MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
  2483. "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
  2484. "movq 8(%1, %0, 4), %%mm0\n\t"
  2485. "movq %%mm0, %%mm3\n\t"
  2486. "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
  2487. "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
  2488. MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
  2489. MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
  2490. "movq %%mm4, %%mm6\n\t"
  2491. "movq 16(%1, %0, 4), %%mm0\n\t"
  2492. "movq %%mm0, %%mm3\n\t"
  2493. "punpcklbw %%mm5, %%mm4\n\t"
  2494. "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
  2495. "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
  2496. MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
  2497. MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
  2498. "punpckhbw %%mm5, %%mm6\n\t"
  2499. "movq 24(%1, %0, 4), %%mm0\n\t"
  2500. "movq %%mm0, %%mm3\n\t"
  2501. "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
  2502. "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
  2503. MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
  2504. MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
  2505. : "+r" (x)
  2506. : "r"(yp), "r" (up), "r"(vp), "r"(d)
  2507. :"memory");
  2508. }
  2509. #endif
  2510. for(; x<w; x++)
  2511. {
  2512. const int x2= x<<2;
  2513. d[8*x+0]=yp[x2];
  2514. d[8*x+1]=up[x];
  2515. d[8*x+2]=yp[x2+1];
  2516. d[8*x+3]=vp[x];
  2517. d[8*x+4]=yp[x2+2];
  2518. d[8*x+5]=up[x];
  2519. d[8*x+6]=yp[x2+3];
  2520. d[8*x+7]=vp[x];
  2521. }
  2522. }
  2523. #ifdef HAVE_MMX
  2524. asm(
  2525. EMMS" \n\t"
  2526. SFENCE" \n\t"
  2527. ::: "memory"
  2528. );
  2529. #endif
  2530. }