You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2665 lines
67KB

  1. /*
  2. *
  3. * rgb2rgb.c, Software RGB to RGB convertor
  4. * pluralize by Software PAL8 to RGB convertor
  5. * Software YUV to YUV convertor
  6. * Software YUV to RGB convertor
  7. * Written by Nick Kurshev.
  8. * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
  9. * lot of big-endian byteorder fixes by Alex Beregszaszi
  10. */
  11. #include <stddef.h>
  12. #include <inttypes.h> /* for __WORDSIZE */
  /*
   * Instruction-selection macros used by the conversion routines below.
   * Each macro expands to an assembler mnemonic (as a string literal) chosen
   * from the HAVE_SSE2 / HAVE_3DNOW / HAVE_MMX2 configuration symbols.
   */
  #ifndef __WORDSIZE
  // #warning You have misconfigured system and probably will lose performance!
  #define __WORDSIZE MP_WORDSIZE
  #endif

  /* Start from a clean slate so the definitions below never clash with
     earlier ones (presumably from a previous inclusion of this template
     with different HAVE_* settings -- confirm against the including file). */
  #undef PREFETCH
  #undef MOVNTQ
  #undef EMMS
  #undef SFENCE
  #undef MMREG_SIZE
  #undef PREFETCHW
  #undef PAVGB

  /* Width in bytes of the widest SIMD register assumed available. */
  #ifdef HAVE_SSE2
  #define MMREG_SIZE 16
  #else
  #define MMREG_SIZE 8
  #endif

  /*
   * Prefetch and byte-average mnemonics.
   * Without 3DNow! or MMX2 the prefetches become an assembler comment, so the
   * operand text that follows the macro at each use site is ignored.  '#'
   * is used rather than '/' because '/' is only a comment character on some
   * GNU as targets.  PAVGB is intentionally left undefined in that case.
   */
  #ifdef HAVE_3DNOW
  #define PREFETCH "prefetch"
  #define PREFETCHW "prefetchw"
  #define PAVGB "pavgusb"
  #elif defined ( HAVE_MMX2 )
  #define PREFETCH "prefetchnta"
  #define PREFETCHW "prefetcht0"
  #define PAVGB "pavgb"
  #else
  #define PREFETCH " # nop"
  #define PREFETCHW " # nop"
  #endif

  #ifdef HAVE_3DNOW
  /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
  #define EMMS "femms"
  #else
  #define EMMS "emms"
  #endif

  /* Non-temporal store plus the fence that must follow it; plain movq and an
     assembler comment when MMX2 is not available. */
  #ifdef HAVE_MMX2
  #define MOVNTQ "movntq"
  #define SFENCE "sfence"
  #else
  #define MOVNTQ "movq"
  #define SFENCE " # nop"
  #endif
  54. static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
  55. {
  56. uint8_t *dest = dst;
  57. const uint8_t *s = src;
  58. const uint8_t *end;
  59. #ifdef HAVE_MMX
  60. const uint8_t *mm_end;
  61. #endif
  62. end = s + src_size;
  63. #ifdef HAVE_MMX
  64. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  65. mm_end = end - 23;
  66. __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
  67. while(s < mm_end)
  68. {
  69. __asm __volatile(
  70. PREFETCH" 32%1\n\t"
  71. "movd %1, %%mm0\n\t"
  72. "punpckldq 3%1, %%mm0\n\t"
  73. "movd 6%1, %%mm1\n\t"
  74. "punpckldq 9%1, %%mm1\n\t"
  75. "movd 12%1, %%mm2\n\t"
  76. "punpckldq 15%1, %%mm2\n\t"
  77. "movd 18%1, %%mm3\n\t"
  78. "punpckldq 21%1, %%mm3\n\t"
  79. "pand %%mm7, %%mm0\n\t"
  80. "pand %%mm7, %%mm1\n\t"
  81. "pand %%mm7, %%mm2\n\t"
  82. "pand %%mm7, %%mm3\n\t"
  83. MOVNTQ" %%mm0, %0\n\t"
  84. MOVNTQ" %%mm1, 8%0\n\t"
  85. MOVNTQ" %%mm2, 16%0\n\t"
  86. MOVNTQ" %%mm3, 24%0"
  87. :"=m"(*dest)
  88. :"m"(*s)
  89. :"memory");
  90. dest += 32;
  91. s += 24;
  92. }
  93. __asm __volatile(SFENCE:::"memory");
  94. __asm __volatile(EMMS:::"memory");
  95. #endif
  96. while(s < end)
  97. {
  98. #ifdef WORDS_BIGENDIAN
  99. /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
  100. *dest++ = 0;
  101. *dest++ = s[2];
  102. *dest++ = s[1];
  103. *dest++ = s[0];
  104. s+=3;
  105. #else
  106. *dest++ = *s++;
  107. *dest++ = *s++;
  108. *dest++ = *s++;
  109. *dest++ = 0;
  110. #endif
  111. }
  112. }
  113. static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
  114. {
  115. uint8_t *dest = dst;
  116. const uint8_t *s = src;
  117. const uint8_t *end;
  118. #ifdef HAVE_MMX
  119. const uint8_t *mm_end;
  120. #endif
  121. end = s + src_size;
  122. #ifdef HAVE_MMX
  123. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  124. mm_end = end - 31;
  125. while(s < mm_end)
  126. {
  127. __asm __volatile(
  128. PREFETCH" 32%1\n\t"
  129. "movq %1, %%mm0\n\t"
  130. "movq 8%1, %%mm1\n\t"
  131. "movq 16%1, %%mm4\n\t"
  132. "movq 24%1, %%mm5\n\t"
  133. "movq %%mm0, %%mm2\n\t"
  134. "movq %%mm1, %%mm3\n\t"
  135. "movq %%mm4, %%mm6\n\t"
  136. "movq %%mm5, %%mm7\n\t"
  137. "psrlq $8, %%mm2\n\t"
  138. "psrlq $8, %%mm3\n\t"
  139. "psrlq $8, %%mm6\n\t"
  140. "psrlq $8, %%mm7\n\t"
  141. "pand %2, %%mm0\n\t"
  142. "pand %2, %%mm1\n\t"
  143. "pand %2, %%mm4\n\t"
  144. "pand %2, %%mm5\n\t"
  145. "pand %3, %%mm2\n\t"
  146. "pand %3, %%mm3\n\t"
  147. "pand %3, %%mm6\n\t"
  148. "pand %3, %%mm7\n\t"
  149. "por %%mm2, %%mm0\n\t"
  150. "por %%mm3, %%mm1\n\t"
  151. "por %%mm6, %%mm4\n\t"
  152. "por %%mm7, %%mm5\n\t"
  153. "movq %%mm1, %%mm2\n\t"
  154. "movq %%mm4, %%mm3\n\t"
  155. "psllq $48, %%mm2\n\t"
  156. "psllq $32, %%mm3\n\t"
  157. "pand %4, %%mm2\n\t"
  158. "pand %5, %%mm3\n\t"
  159. "por %%mm2, %%mm0\n\t"
  160. "psrlq $16, %%mm1\n\t"
  161. "psrlq $32, %%mm4\n\t"
  162. "psllq $16, %%mm5\n\t"
  163. "por %%mm3, %%mm1\n\t"
  164. "pand %6, %%mm5\n\t"
  165. "por %%mm5, %%mm4\n\t"
  166. MOVNTQ" %%mm0, %0\n\t"
  167. MOVNTQ" %%mm1, 8%0\n\t"
  168. MOVNTQ" %%mm4, 16%0"
  169. :"=m"(*dest)
  170. :"m"(*s),"m"(mask24l),
  171. "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
  172. :"memory");
  173. dest += 24;
  174. s += 32;
  175. }
  176. __asm __volatile(SFENCE:::"memory");
  177. __asm __volatile(EMMS:::"memory");
  178. #endif
  179. while(s < end)
  180. {
  181. #ifdef WORDS_BIGENDIAN
  182. /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
  183. s++;
  184. dest[2] = *s++;
  185. dest[1] = *s++;
  186. dest[0] = *s++;
  187. dest += 3;
  188. #else
  189. *dest++ = *s++;
  190. *dest++ = *s++;
  191. *dest++ = *s++;
  192. s++;
  193. #endif
  194. }
  195. }
  196. /*
  197. Original by Strepto/Astral
  198. ported to gcc & bugfixed : A'rpi
  199. MMX2, 3DNOW optimization by Nick Kurshev
  200. 32bit c version, and and&add trick by Michael Niedermayer
  201. */
  202. static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
  203. {
  204. register const uint8_t* s=src;
  205. register uint8_t* d=dst;
  206. register const uint8_t *end;
  207. const uint8_t *mm_end;
  208. end = s + src_size;
  209. #ifdef HAVE_MMX
  210. __asm __volatile(PREFETCH" %0"::"m"(*s));
  211. __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
  212. mm_end = end - 15;
  213. while(s<mm_end)
  214. {
  215. __asm __volatile(
  216. PREFETCH" 32%1\n\t"
  217. "movq %1, %%mm0\n\t"
  218. "movq 8%1, %%mm2\n\t"
  219. "movq %%mm0, %%mm1\n\t"
  220. "movq %%mm2, %%mm3\n\t"
  221. "pand %%mm4, %%mm0\n\t"
  222. "pand %%mm4, %%mm2\n\t"
  223. "paddw %%mm1, %%mm0\n\t"
  224. "paddw %%mm3, %%mm2\n\t"
  225. MOVNTQ" %%mm0, %0\n\t"
  226. MOVNTQ" %%mm2, 8%0"
  227. :"=m"(*d)
  228. :"m"(*s)
  229. );
  230. d+=16;
  231. s+=16;
  232. }
  233. __asm __volatile(SFENCE:::"memory");
  234. __asm __volatile(EMMS:::"memory");
  235. #endif
  236. mm_end = end - 3;
  237. while(s < mm_end)
  238. {
  239. register unsigned x= *((uint32_t *)s);
  240. *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
  241. d+=4;
  242. s+=4;
  243. }
  244. if(s < end)
  245. {
  246. register unsigned short x= *((uint16_t *)s);
  247. *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
  248. }
  249. }
  250. static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
  251. {
  252. register const uint8_t* s=src;
  253. register uint8_t* d=dst;
  254. register const uint8_t *end;
  255. const uint8_t *mm_end;
  256. end = s + src_size;
  257. #ifdef HAVE_MMX
  258. __asm __volatile(PREFETCH" %0"::"m"(*s));
  259. __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
  260. __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
  261. mm_end = end - 15;
  262. while(s<mm_end)
  263. {
  264. __asm __volatile(
  265. PREFETCH" 32%1\n\t"
  266. "movq %1, %%mm0\n\t"
  267. "movq 8%1, %%mm2\n\t"
  268. "movq %%mm0, %%mm1\n\t"
  269. "movq %%mm2, %%mm3\n\t"
  270. "psrlq $1, %%mm0\n\t"
  271. "psrlq $1, %%mm2\n\t"
  272. "pand %%mm7, %%mm0\n\t"
  273. "pand %%mm7, %%mm2\n\t"
  274. "pand %%mm6, %%mm1\n\t"
  275. "pand %%mm6, %%mm3\n\t"
  276. "por %%mm1, %%mm0\n\t"
  277. "por %%mm3, %%mm2\n\t"
  278. MOVNTQ" %%mm0, %0\n\t"
  279. MOVNTQ" %%mm2, 8%0"
  280. :"=m"(*d)
  281. :"m"(*s)
  282. );
  283. d+=16;
  284. s+=16;
  285. }
  286. __asm __volatile(SFENCE:::"memory");
  287. __asm __volatile(EMMS:::"memory");
  288. #endif
  289. mm_end = end - 3;
  290. while(s < mm_end)
  291. {
  292. register uint32_t x= *((uint32_t *)s);
  293. *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
  294. s+=4;
  295. d+=4;
  296. }
  297. if(s < end)
  298. {
  299. register uint16_t x= *((uint16_t *)s);
  300. *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
  301. s+=2;
  302. d+=2;
  303. }
  304. }
  305. static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
  306. {
  307. const uint8_t *s = src;
  308. const uint8_t *end;
  309. #ifdef HAVE_MMX
  310. const uint8_t *mm_end;
  311. #endif
  312. uint16_t *d = (uint16_t *)dst;
  313. end = s + src_size;
  314. #ifdef HAVE_MMX
  315. mm_end = end - 15;
  316. #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
  317. asm volatile(
  318. "movq %3, %%mm5 \n\t"
  319. "movq %4, %%mm6 \n\t"
  320. "movq %5, %%mm7 \n\t"
  321. ".balign 16 \n\t"
  322. "1: \n\t"
  323. PREFETCH" 32(%1) \n\t"
  324. "movd (%1), %%mm0 \n\t"
  325. "movd 4(%1), %%mm3 \n\t"
  326. "punpckldq 8(%1), %%mm0 \n\t"
  327. "punpckldq 12(%1), %%mm3 \n\t"
  328. "movq %%mm0, %%mm1 \n\t"
  329. "movq %%mm3, %%mm4 \n\t"
  330. "pand %%mm6, %%mm0 \n\t"
  331. "pand %%mm6, %%mm3 \n\t"
  332. "pmaddwd %%mm7, %%mm0 \n\t"
  333. "pmaddwd %%mm7, %%mm3 \n\t"
  334. "pand %%mm5, %%mm1 \n\t"
  335. "pand %%mm5, %%mm4 \n\t"
  336. "por %%mm1, %%mm0 \n\t"
  337. "por %%mm4, %%mm3 \n\t"
  338. "psrld $5, %%mm0 \n\t"
  339. "pslld $11, %%mm3 \n\t"
  340. "por %%mm3, %%mm0 \n\t"
  341. MOVNTQ" %%mm0, (%0) \n\t"
  342. "add $16, %1 \n\t"
  343. "add $8, %0 \n\t"
  344. "cmp %2, %1 \n\t"
  345. " jb 1b \n\t"
  346. : "+r" (d), "+r"(s)
  347. : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
  348. );
  349. #else
  350. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  351. __asm __volatile(
  352. "movq %0, %%mm7\n\t"
  353. "movq %1, %%mm6\n\t"
  354. ::"m"(red_16mask),"m"(green_16mask));
  355. while(s < mm_end)
  356. {
  357. __asm __volatile(
  358. PREFETCH" 32%1\n\t"
  359. "movd %1, %%mm0\n\t"
  360. "movd 4%1, %%mm3\n\t"
  361. "punpckldq 8%1, %%mm0\n\t"
  362. "punpckldq 12%1, %%mm3\n\t"
  363. "movq %%mm0, %%mm1\n\t"
  364. "movq %%mm0, %%mm2\n\t"
  365. "movq %%mm3, %%mm4\n\t"
  366. "movq %%mm3, %%mm5\n\t"
  367. "psrlq $3, %%mm0\n\t"
  368. "psrlq $3, %%mm3\n\t"
  369. "pand %2, %%mm0\n\t"
  370. "pand %2, %%mm3\n\t"
  371. "psrlq $5, %%mm1\n\t"
  372. "psrlq $5, %%mm4\n\t"
  373. "pand %%mm6, %%mm1\n\t"
  374. "pand %%mm6, %%mm4\n\t"
  375. "psrlq $8, %%mm2\n\t"
  376. "psrlq $8, %%mm5\n\t"
  377. "pand %%mm7, %%mm2\n\t"
  378. "pand %%mm7, %%mm5\n\t"
  379. "por %%mm1, %%mm0\n\t"
  380. "por %%mm4, %%mm3\n\t"
  381. "por %%mm2, %%mm0\n\t"
  382. "por %%mm5, %%mm3\n\t"
  383. "psllq $16, %%mm3\n\t"
  384. "por %%mm3, %%mm0\n\t"
  385. MOVNTQ" %%mm0, %0\n\t"
  386. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  387. d += 4;
  388. s += 16;
  389. }
  390. #endif
  391. __asm __volatile(SFENCE:::"memory");
  392. __asm __volatile(EMMS:::"memory");
  393. #endif
  394. while(s < end)
  395. {
  396. register int rgb = *(uint32_t*)s; s += 4;
  397. *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
  398. }
  399. }
  400. static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
  401. {
  402. const uint8_t *s = src;
  403. const uint8_t *end;
  404. #ifdef HAVE_MMX
  405. const uint8_t *mm_end;
  406. #endif
  407. uint16_t *d = (uint16_t *)dst;
  408. end = s + src_size;
  409. #ifdef HAVE_MMX
  410. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  411. __asm __volatile(
  412. "movq %0, %%mm7\n\t"
  413. "movq %1, %%mm6\n\t"
  414. ::"m"(red_16mask),"m"(green_16mask));
  415. mm_end = end - 15;
  416. while(s < mm_end)
  417. {
  418. __asm __volatile(
  419. PREFETCH" 32%1\n\t"
  420. "movd %1, %%mm0\n\t"
  421. "movd 4%1, %%mm3\n\t"
  422. "punpckldq 8%1, %%mm0\n\t"
  423. "punpckldq 12%1, %%mm3\n\t"
  424. "movq %%mm0, %%mm1\n\t"
  425. "movq %%mm0, %%mm2\n\t"
  426. "movq %%mm3, %%mm4\n\t"
  427. "movq %%mm3, %%mm5\n\t"
  428. "psllq $8, %%mm0\n\t"
  429. "psllq $8, %%mm3\n\t"
  430. "pand %%mm7, %%mm0\n\t"
  431. "pand %%mm7, %%mm3\n\t"
  432. "psrlq $5, %%mm1\n\t"
  433. "psrlq $5, %%mm4\n\t"
  434. "pand %%mm6, %%mm1\n\t"
  435. "pand %%mm6, %%mm4\n\t"
  436. "psrlq $19, %%mm2\n\t"
  437. "psrlq $19, %%mm5\n\t"
  438. "pand %2, %%mm2\n\t"
  439. "pand %2, %%mm5\n\t"
  440. "por %%mm1, %%mm0\n\t"
  441. "por %%mm4, %%mm3\n\t"
  442. "por %%mm2, %%mm0\n\t"
  443. "por %%mm5, %%mm3\n\t"
  444. "psllq $16, %%mm3\n\t"
  445. "por %%mm3, %%mm0\n\t"
  446. MOVNTQ" %%mm0, %0\n\t"
  447. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  448. d += 4;
  449. s += 16;
  450. }
  451. __asm __volatile(SFENCE:::"memory");
  452. __asm __volatile(EMMS:::"memory");
  453. #endif
  454. while(s < end)
  455. {
  456. register int rgb = *(uint32_t*)s; s += 4;
  457. *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
  458. }
  459. }
  460. static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
  461. {
  462. const uint8_t *s = src;
  463. const uint8_t *end;
  464. #ifdef HAVE_MMX
  465. const uint8_t *mm_end;
  466. #endif
  467. uint16_t *d = (uint16_t *)dst;
  468. end = s + src_size;
  469. #ifdef HAVE_MMX
  470. mm_end = end - 15;
  471. #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
  472. asm volatile(
  473. "movq %3, %%mm5 \n\t"
  474. "movq %4, %%mm6 \n\t"
  475. "movq %5, %%mm7 \n\t"
  476. ".balign 16 \n\t"
  477. "1: \n\t"
  478. PREFETCH" 32(%1) \n\t"
  479. "movd (%1), %%mm0 \n\t"
  480. "movd 4(%1), %%mm3 \n\t"
  481. "punpckldq 8(%1), %%mm0 \n\t"
  482. "punpckldq 12(%1), %%mm3 \n\t"
  483. "movq %%mm0, %%mm1 \n\t"
  484. "movq %%mm3, %%mm4 \n\t"
  485. "pand %%mm6, %%mm0 \n\t"
  486. "pand %%mm6, %%mm3 \n\t"
  487. "pmaddwd %%mm7, %%mm0 \n\t"
  488. "pmaddwd %%mm7, %%mm3 \n\t"
  489. "pand %%mm5, %%mm1 \n\t"
  490. "pand %%mm5, %%mm4 \n\t"
  491. "por %%mm1, %%mm0 \n\t"
  492. "por %%mm4, %%mm3 \n\t"
  493. "psrld $6, %%mm0 \n\t"
  494. "pslld $10, %%mm3 \n\t"
  495. "por %%mm3, %%mm0 \n\t"
  496. MOVNTQ" %%mm0, (%0) \n\t"
  497. "add $16, %1 \n\t"
  498. "add $8, %0 \n\t"
  499. "cmp %2, %1 \n\t"
  500. " jb 1b \n\t"
  501. : "+r" (d), "+r"(s)
  502. : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
  503. );
  504. #else
  505. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  506. __asm __volatile(
  507. "movq %0, %%mm7\n\t"
  508. "movq %1, %%mm6\n\t"
  509. ::"m"(red_15mask),"m"(green_15mask));
  510. while(s < mm_end)
  511. {
  512. __asm __volatile(
  513. PREFETCH" 32%1\n\t"
  514. "movd %1, %%mm0\n\t"
  515. "movd 4%1, %%mm3\n\t"
  516. "punpckldq 8%1, %%mm0\n\t"
  517. "punpckldq 12%1, %%mm3\n\t"
  518. "movq %%mm0, %%mm1\n\t"
  519. "movq %%mm0, %%mm2\n\t"
  520. "movq %%mm3, %%mm4\n\t"
  521. "movq %%mm3, %%mm5\n\t"
  522. "psrlq $3, %%mm0\n\t"
  523. "psrlq $3, %%mm3\n\t"
  524. "pand %2, %%mm0\n\t"
  525. "pand %2, %%mm3\n\t"
  526. "psrlq $6, %%mm1\n\t"
  527. "psrlq $6, %%mm4\n\t"
  528. "pand %%mm6, %%mm1\n\t"
  529. "pand %%mm6, %%mm4\n\t"
  530. "psrlq $9, %%mm2\n\t"
  531. "psrlq $9, %%mm5\n\t"
  532. "pand %%mm7, %%mm2\n\t"
  533. "pand %%mm7, %%mm5\n\t"
  534. "por %%mm1, %%mm0\n\t"
  535. "por %%mm4, %%mm3\n\t"
  536. "por %%mm2, %%mm0\n\t"
  537. "por %%mm5, %%mm3\n\t"
  538. "psllq $16, %%mm3\n\t"
  539. "por %%mm3, %%mm0\n\t"
  540. MOVNTQ" %%mm0, %0\n\t"
  541. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  542. d += 4;
  543. s += 16;
  544. }
  545. #endif
  546. __asm __volatile(SFENCE:::"memory");
  547. __asm __volatile(EMMS:::"memory");
  548. #endif
  549. while(s < end)
  550. {
  551. register int rgb = *(uint32_t*)s; s += 4;
  552. *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
  553. }
  554. }
  555. static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
  556. {
  557. const uint8_t *s = src;
  558. const uint8_t *end;
  559. #ifdef HAVE_MMX
  560. const uint8_t *mm_end;
  561. #endif
  562. uint16_t *d = (uint16_t *)dst;
  563. end = s + src_size;
  564. #ifdef HAVE_MMX
  565. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  566. __asm __volatile(
  567. "movq %0, %%mm7\n\t"
  568. "movq %1, %%mm6\n\t"
  569. ::"m"(red_15mask),"m"(green_15mask));
  570. mm_end = end - 15;
  571. while(s < mm_end)
  572. {
  573. __asm __volatile(
  574. PREFETCH" 32%1\n\t"
  575. "movd %1, %%mm0\n\t"
  576. "movd 4%1, %%mm3\n\t"
  577. "punpckldq 8%1, %%mm0\n\t"
  578. "punpckldq 12%1, %%mm3\n\t"
  579. "movq %%mm0, %%mm1\n\t"
  580. "movq %%mm0, %%mm2\n\t"
  581. "movq %%mm3, %%mm4\n\t"
  582. "movq %%mm3, %%mm5\n\t"
  583. "psllq $7, %%mm0\n\t"
  584. "psllq $7, %%mm3\n\t"
  585. "pand %%mm7, %%mm0\n\t"
  586. "pand %%mm7, %%mm3\n\t"
  587. "psrlq $6, %%mm1\n\t"
  588. "psrlq $6, %%mm4\n\t"
  589. "pand %%mm6, %%mm1\n\t"
  590. "pand %%mm6, %%mm4\n\t"
  591. "psrlq $19, %%mm2\n\t"
  592. "psrlq $19, %%mm5\n\t"
  593. "pand %2, %%mm2\n\t"
  594. "pand %2, %%mm5\n\t"
  595. "por %%mm1, %%mm0\n\t"
  596. "por %%mm4, %%mm3\n\t"
  597. "por %%mm2, %%mm0\n\t"
  598. "por %%mm5, %%mm3\n\t"
  599. "psllq $16, %%mm3\n\t"
  600. "por %%mm3, %%mm0\n\t"
  601. MOVNTQ" %%mm0, %0\n\t"
  602. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  603. d += 4;
  604. s += 16;
  605. }
  606. __asm __volatile(SFENCE:::"memory");
  607. __asm __volatile(EMMS:::"memory");
  608. #endif
  609. while(s < end)
  610. {
  611. register int rgb = *(uint32_t*)s; s += 4;
  612. *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
  613. }
  614. }
  615. static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
  616. {
  617. const uint8_t *s = src;
  618. const uint8_t *end;
  619. #ifdef HAVE_MMX
  620. const uint8_t *mm_end;
  621. #endif
  622. uint16_t *d = (uint16_t *)dst;
  623. end = s + src_size;
  624. #ifdef HAVE_MMX
  625. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  626. __asm __volatile(
  627. "movq %0, %%mm7\n\t"
  628. "movq %1, %%mm6\n\t"
  629. ::"m"(red_16mask),"m"(green_16mask));
  630. mm_end = end - 11;
  631. while(s < mm_end)
  632. {
  633. __asm __volatile(
  634. PREFETCH" 32%1\n\t"
  635. "movd %1, %%mm0\n\t"
  636. "movd 3%1, %%mm3\n\t"
  637. "punpckldq 6%1, %%mm0\n\t"
  638. "punpckldq 9%1, %%mm3\n\t"
  639. "movq %%mm0, %%mm1\n\t"
  640. "movq %%mm0, %%mm2\n\t"
  641. "movq %%mm3, %%mm4\n\t"
  642. "movq %%mm3, %%mm5\n\t"
  643. "psrlq $3, %%mm0\n\t"
  644. "psrlq $3, %%mm3\n\t"
  645. "pand %2, %%mm0\n\t"
  646. "pand %2, %%mm3\n\t"
  647. "psrlq $5, %%mm1\n\t"
  648. "psrlq $5, %%mm4\n\t"
  649. "pand %%mm6, %%mm1\n\t"
  650. "pand %%mm6, %%mm4\n\t"
  651. "psrlq $8, %%mm2\n\t"
  652. "psrlq $8, %%mm5\n\t"
  653. "pand %%mm7, %%mm2\n\t"
  654. "pand %%mm7, %%mm5\n\t"
  655. "por %%mm1, %%mm0\n\t"
  656. "por %%mm4, %%mm3\n\t"
  657. "por %%mm2, %%mm0\n\t"
  658. "por %%mm5, %%mm3\n\t"
  659. "psllq $16, %%mm3\n\t"
  660. "por %%mm3, %%mm0\n\t"
  661. MOVNTQ" %%mm0, %0\n\t"
  662. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  663. d += 4;
  664. s += 12;
  665. }
  666. __asm __volatile(SFENCE:::"memory");
  667. __asm __volatile(EMMS:::"memory");
  668. #endif
  669. while(s < end)
  670. {
  671. const int b= *s++;
  672. const int g= *s++;
  673. const int r= *s++;
  674. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  675. }
  676. }
  677. static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
  678. {
  679. const uint8_t *s = src;
  680. const uint8_t *end;
  681. #ifdef HAVE_MMX
  682. const uint8_t *mm_end;
  683. #endif
  684. uint16_t *d = (uint16_t *)dst;
  685. end = s + src_size;
  686. #ifdef HAVE_MMX
  687. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  688. __asm __volatile(
  689. "movq %0, %%mm7\n\t"
  690. "movq %1, %%mm6\n\t"
  691. ::"m"(red_16mask),"m"(green_16mask));
  692. mm_end = end - 15;
  693. while(s < mm_end)
  694. {
  695. __asm __volatile(
  696. PREFETCH" 32%1\n\t"
  697. "movd %1, %%mm0\n\t"
  698. "movd 3%1, %%mm3\n\t"
  699. "punpckldq 6%1, %%mm0\n\t"
  700. "punpckldq 9%1, %%mm3\n\t"
  701. "movq %%mm0, %%mm1\n\t"
  702. "movq %%mm0, %%mm2\n\t"
  703. "movq %%mm3, %%mm4\n\t"
  704. "movq %%mm3, %%mm5\n\t"
  705. "psllq $8, %%mm0\n\t"
  706. "psllq $8, %%mm3\n\t"
  707. "pand %%mm7, %%mm0\n\t"
  708. "pand %%mm7, %%mm3\n\t"
  709. "psrlq $5, %%mm1\n\t"
  710. "psrlq $5, %%mm4\n\t"
  711. "pand %%mm6, %%mm1\n\t"
  712. "pand %%mm6, %%mm4\n\t"
  713. "psrlq $19, %%mm2\n\t"
  714. "psrlq $19, %%mm5\n\t"
  715. "pand %2, %%mm2\n\t"
  716. "pand %2, %%mm5\n\t"
  717. "por %%mm1, %%mm0\n\t"
  718. "por %%mm4, %%mm3\n\t"
  719. "por %%mm2, %%mm0\n\t"
  720. "por %%mm5, %%mm3\n\t"
  721. "psllq $16, %%mm3\n\t"
  722. "por %%mm3, %%mm0\n\t"
  723. MOVNTQ" %%mm0, %0\n\t"
  724. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  725. d += 4;
  726. s += 12;
  727. }
  728. __asm __volatile(SFENCE:::"memory");
  729. __asm __volatile(EMMS:::"memory");
  730. #endif
  731. while(s < end)
  732. {
  733. const int r= *s++;
  734. const int g= *s++;
  735. const int b= *s++;
  736. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  737. }
  738. }
  739. static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
  740. {
  741. const uint8_t *s = src;
  742. const uint8_t *end;
  743. #ifdef HAVE_MMX
  744. const uint8_t *mm_end;
  745. #endif
  746. uint16_t *d = (uint16_t *)dst;
  747. end = s + src_size;
  748. #ifdef HAVE_MMX
  749. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  750. __asm __volatile(
  751. "movq %0, %%mm7\n\t"
  752. "movq %1, %%mm6\n\t"
  753. ::"m"(red_15mask),"m"(green_15mask));
  754. mm_end = end - 11;
  755. while(s < mm_end)
  756. {
  757. __asm __volatile(
  758. PREFETCH" 32%1\n\t"
  759. "movd %1, %%mm0\n\t"
  760. "movd 3%1, %%mm3\n\t"
  761. "punpckldq 6%1, %%mm0\n\t"
  762. "punpckldq 9%1, %%mm3\n\t"
  763. "movq %%mm0, %%mm1\n\t"
  764. "movq %%mm0, %%mm2\n\t"
  765. "movq %%mm3, %%mm4\n\t"
  766. "movq %%mm3, %%mm5\n\t"
  767. "psrlq $3, %%mm0\n\t"
  768. "psrlq $3, %%mm3\n\t"
  769. "pand %2, %%mm0\n\t"
  770. "pand %2, %%mm3\n\t"
  771. "psrlq $6, %%mm1\n\t"
  772. "psrlq $6, %%mm4\n\t"
  773. "pand %%mm6, %%mm1\n\t"
  774. "pand %%mm6, %%mm4\n\t"
  775. "psrlq $9, %%mm2\n\t"
  776. "psrlq $9, %%mm5\n\t"
  777. "pand %%mm7, %%mm2\n\t"
  778. "pand %%mm7, %%mm5\n\t"
  779. "por %%mm1, %%mm0\n\t"
  780. "por %%mm4, %%mm3\n\t"
  781. "por %%mm2, %%mm0\n\t"
  782. "por %%mm5, %%mm3\n\t"
  783. "psllq $16, %%mm3\n\t"
  784. "por %%mm3, %%mm0\n\t"
  785. MOVNTQ" %%mm0, %0\n\t"
  786. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  787. d += 4;
  788. s += 12;
  789. }
  790. __asm __volatile(SFENCE:::"memory");
  791. __asm __volatile(EMMS:::"memory");
  792. #endif
  793. while(s < end)
  794. {
  795. const int b= *s++;
  796. const int g= *s++;
  797. const int r= *s++;
  798. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  799. }
  800. }
  801. static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
  802. {
  803. const uint8_t *s = src;
  804. const uint8_t *end;
  805. #ifdef HAVE_MMX
  806. const uint8_t *mm_end;
  807. #endif
  808. uint16_t *d = (uint16_t *)dst;
  809. end = s + src_size;
  810. #ifdef HAVE_MMX
  811. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  812. __asm __volatile(
  813. "movq %0, %%mm7\n\t"
  814. "movq %1, %%mm6\n\t"
  815. ::"m"(red_15mask),"m"(green_15mask));
  816. mm_end = end - 15;
  817. while(s < mm_end)
  818. {
  819. __asm __volatile(
  820. PREFETCH" 32%1\n\t"
  821. "movd %1, %%mm0\n\t"
  822. "movd 3%1, %%mm3\n\t"
  823. "punpckldq 6%1, %%mm0\n\t"
  824. "punpckldq 9%1, %%mm3\n\t"
  825. "movq %%mm0, %%mm1\n\t"
  826. "movq %%mm0, %%mm2\n\t"
  827. "movq %%mm3, %%mm4\n\t"
  828. "movq %%mm3, %%mm5\n\t"
  829. "psllq $7, %%mm0\n\t"
  830. "psllq $7, %%mm3\n\t"
  831. "pand %%mm7, %%mm0\n\t"
  832. "pand %%mm7, %%mm3\n\t"
  833. "psrlq $6, %%mm1\n\t"
  834. "psrlq $6, %%mm4\n\t"
  835. "pand %%mm6, %%mm1\n\t"
  836. "pand %%mm6, %%mm4\n\t"
  837. "psrlq $19, %%mm2\n\t"
  838. "psrlq $19, %%mm5\n\t"
  839. "pand %2, %%mm2\n\t"
  840. "pand %2, %%mm5\n\t"
  841. "por %%mm1, %%mm0\n\t"
  842. "por %%mm4, %%mm3\n\t"
  843. "por %%mm2, %%mm0\n\t"
  844. "por %%mm5, %%mm3\n\t"
  845. "psllq $16, %%mm3\n\t"
  846. "por %%mm3, %%mm0\n\t"
  847. MOVNTQ" %%mm0, %0\n\t"
  848. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  849. d += 4;
  850. s += 12;
  851. }
  852. __asm __volatile(SFENCE:::"memory");
  853. __asm __volatile(EMMS:::"memory");
  854. #endif
  855. while(s < end)
  856. {
  857. const int r= *s++;
  858. const int g= *s++;
  859. const int b= *s++;
  860. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  861. }
  862. }
  /*
    The 15/16-bit to 24/32-bit converters below use a less accurate
    approximation: the input value is simply left-shifted and the low-order
    bits are filled with zeroes. This method improves PNG's compression, but
    it cannot reproduce white exactly, since it does not generate an all-ones
    maximum value; the net effect is to darken the image slightly.

    The better method would be "left bit replication":

     4 3 2 1 0
     ---------
     1 1 0 1 1

     7 6 5 4 3  2 1 0
     ----------------
     1 1 0 1 1  1 1 0
     |=======|  |===|
         |      Leftmost Bits Repeated to Fill Open Bits
         |
     Original Bits
  */
  883. static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
  884. {
  885. const uint16_t *end;
  886. #ifdef HAVE_MMX
  887. const uint16_t *mm_end;
  888. #endif
  889. uint8_t *d = (uint8_t *)dst;
  890. const uint16_t *s = (uint16_t *)src;
  891. end = s + src_size/2;
  892. #ifdef HAVE_MMX
  893. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  894. mm_end = end - 7;
  895. while(s < mm_end)
  896. {
  897. __asm __volatile(
  898. PREFETCH" 32%1\n\t"
  899. "movq %1, %%mm0\n\t"
  900. "movq %1, %%mm1\n\t"
  901. "movq %1, %%mm2\n\t"
  902. "pand %2, %%mm0\n\t"
  903. "pand %3, %%mm1\n\t"
  904. "pand %4, %%mm2\n\t"
  905. "psllq $3, %%mm0\n\t"
  906. "psrlq $2, %%mm1\n\t"
  907. "psrlq $7, %%mm2\n\t"
  908. "movq %%mm0, %%mm3\n\t"
  909. "movq %%mm1, %%mm4\n\t"
  910. "movq %%mm2, %%mm5\n\t"
  911. "punpcklwd %5, %%mm0\n\t"
  912. "punpcklwd %5, %%mm1\n\t"
  913. "punpcklwd %5, %%mm2\n\t"
  914. "punpckhwd %5, %%mm3\n\t"
  915. "punpckhwd %5, %%mm4\n\t"
  916. "punpckhwd %5, %%mm5\n\t"
  917. "psllq $8, %%mm1\n\t"
  918. "psllq $16, %%mm2\n\t"
  919. "por %%mm1, %%mm0\n\t"
  920. "por %%mm2, %%mm0\n\t"
  921. "psllq $8, %%mm4\n\t"
  922. "psllq $16, %%mm5\n\t"
  923. "por %%mm4, %%mm3\n\t"
  924. "por %%mm5, %%mm3\n\t"
  925. "movq %%mm0, %%mm6\n\t"
  926. "movq %%mm3, %%mm7\n\t"
  927. "movq 8%1, %%mm0\n\t"
  928. "movq 8%1, %%mm1\n\t"
  929. "movq 8%1, %%mm2\n\t"
  930. "pand %2, %%mm0\n\t"
  931. "pand %3, %%mm1\n\t"
  932. "pand %4, %%mm2\n\t"
  933. "psllq $3, %%mm0\n\t"
  934. "psrlq $2, %%mm1\n\t"
  935. "psrlq $7, %%mm2\n\t"
  936. "movq %%mm0, %%mm3\n\t"
  937. "movq %%mm1, %%mm4\n\t"
  938. "movq %%mm2, %%mm5\n\t"
  939. "punpcklwd %5, %%mm0\n\t"
  940. "punpcklwd %5, %%mm1\n\t"
  941. "punpcklwd %5, %%mm2\n\t"
  942. "punpckhwd %5, %%mm3\n\t"
  943. "punpckhwd %5, %%mm4\n\t"
  944. "punpckhwd %5, %%mm5\n\t"
  945. "psllq $8, %%mm1\n\t"
  946. "psllq $16, %%mm2\n\t"
  947. "por %%mm1, %%mm0\n\t"
  948. "por %%mm2, %%mm0\n\t"
  949. "psllq $8, %%mm4\n\t"
  950. "psllq $16, %%mm5\n\t"
  951. "por %%mm4, %%mm3\n\t"
  952. "por %%mm5, %%mm3\n\t"
  953. :"=m"(*d)
  954. :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
  955. :"memory");
  956. /* Borrowed 32 to 24 */
  957. __asm __volatile(
  958. "movq %%mm0, %%mm4\n\t"
  959. "movq %%mm3, %%mm5\n\t"
  960. "movq %%mm6, %%mm0\n\t"
  961. "movq %%mm7, %%mm1\n\t"
  962. "movq %%mm4, %%mm6\n\t"
  963. "movq %%mm5, %%mm7\n\t"
  964. "movq %%mm0, %%mm2\n\t"
  965. "movq %%mm1, %%mm3\n\t"
  966. "psrlq $8, %%mm2\n\t"
  967. "psrlq $8, %%mm3\n\t"
  968. "psrlq $8, %%mm6\n\t"
  969. "psrlq $8, %%mm7\n\t"
  970. "pand %2, %%mm0\n\t"
  971. "pand %2, %%mm1\n\t"
  972. "pand %2, %%mm4\n\t"
  973. "pand %2, %%mm5\n\t"
  974. "pand %3, %%mm2\n\t"
  975. "pand %3, %%mm3\n\t"
  976. "pand %3, %%mm6\n\t"
  977. "pand %3, %%mm7\n\t"
  978. "por %%mm2, %%mm0\n\t"
  979. "por %%mm3, %%mm1\n\t"
  980. "por %%mm6, %%mm4\n\t"
  981. "por %%mm7, %%mm5\n\t"
  982. "movq %%mm1, %%mm2\n\t"
  983. "movq %%mm4, %%mm3\n\t"
  984. "psllq $48, %%mm2\n\t"
  985. "psllq $32, %%mm3\n\t"
  986. "pand %4, %%mm2\n\t"
  987. "pand %5, %%mm3\n\t"
  988. "por %%mm2, %%mm0\n\t"
  989. "psrlq $16, %%mm1\n\t"
  990. "psrlq $32, %%mm4\n\t"
  991. "psllq $16, %%mm5\n\t"
  992. "por %%mm3, %%mm1\n\t"
  993. "pand %6, %%mm5\n\t"
  994. "por %%mm5, %%mm4\n\t"
  995. MOVNTQ" %%mm0, %0\n\t"
  996. MOVNTQ" %%mm1, 8%0\n\t"
  997. MOVNTQ" %%mm4, 16%0"
  998. :"=m"(*d)
  999. :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
  1000. :"memory");
  1001. d += 24;
  1002. s += 8;
  1003. }
  1004. __asm __volatile(SFENCE:::"memory");
  1005. __asm __volatile(EMMS:::"memory");
  1006. #endif
  1007. while(s < end)
  1008. {
  1009. register uint16_t bgr;
  1010. bgr = *s++;
  1011. *d++ = (bgr&0x1F)<<3;
  1012. *d++ = (bgr&0x3E0)>>2;
  1013. *d++ = (bgr&0x7C00)>>7;
  1014. }
  1015. }
  1016. static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
  1017. {
  1018. const uint16_t *end;
  1019. #ifdef HAVE_MMX
  1020. const uint16_t *mm_end;
  1021. #endif
  1022. uint8_t *d = (uint8_t *)dst;
  1023. const uint16_t *s = (const uint16_t *)src;
  1024. end = s + src_size/2;
  1025. #ifdef HAVE_MMX
  1026. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  1027. mm_end = end - 7;
  1028. while(s < mm_end)
  1029. {
  1030. __asm __volatile(
  1031. PREFETCH" 32%1\n\t"
  1032. "movq %1, %%mm0\n\t"
  1033. "movq %1, %%mm1\n\t"
  1034. "movq %1, %%mm2\n\t"
  1035. "pand %2, %%mm0\n\t"
  1036. "pand %3, %%mm1\n\t"
  1037. "pand %4, %%mm2\n\t"
  1038. "psllq $3, %%mm0\n\t"
  1039. "psrlq $3, %%mm1\n\t"
  1040. "psrlq $8, %%mm2\n\t"
  1041. "movq %%mm0, %%mm3\n\t"
  1042. "movq %%mm1, %%mm4\n\t"
  1043. "movq %%mm2, %%mm5\n\t"
  1044. "punpcklwd %5, %%mm0\n\t"
  1045. "punpcklwd %5, %%mm1\n\t"
  1046. "punpcklwd %5, %%mm2\n\t"
  1047. "punpckhwd %5, %%mm3\n\t"
  1048. "punpckhwd %5, %%mm4\n\t"
  1049. "punpckhwd %5, %%mm5\n\t"
  1050. "psllq $8, %%mm1\n\t"
  1051. "psllq $16, %%mm2\n\t"
  1052. "por %%mm1, %%mm0\n\t"
  1053. "por %%mm2, %%mm0\n\t"
  1054. "psllq $8, %%mm4\n\t"
  1055. "psllq $16, %%mm5\n\t"
  1056. "por %%mm4, %%mm3\n\t"
  1057. "por %%mm5, %%mm3\n\t"
  1058. "movq %%mm0, %%mm6\n\t"
  1059. "movq %%mm3, %%mm7\n\t"
  1060. "movq 8%1, %%mm0\n\t"
  1061. "movq 8%1, %%mm1\n\t"
  1062. "movq 8%1, %%mm2\n\t"
  1063. "pand %2, %%mm0\n\t"
  1064. "pand %3, %%mm1\n\t"
  1065. "pand %4, %%mm2\n\t"
  1066. "psllq $3, %%mm0\n\t"
  1067. "psrlq $3, %%mm1\n\t"
  1068. "psrlq $8, %%mm2\n\t"
  1069. "movq %%mm0, %%mm3\n\t"
  1070. "movq %%mm1, %%mm4\n\t"
  1071. "movq %%mm2, %%mm5\n\t"
  1072. "punpcklwd %5, %%mm0\n\t"
  1073. "punpcklwd %5, %%mm1\n\t"
  1074. "punpcklwd %5, %%mm2\n\t"
  1075. "punpckhwd %5, %%mm3\n\t"
  1076. "punpckhwd %5, %%mm4\n\t"
  1077. "punpckhwd %5, %%mm5\n\t"
  1078. "psllq $8, %%mm1\n\t"
  1079. "psllq $16, %%mm2\n\t"
  1080. "por %%mm1, %%mm0\n\t"
  1081. "por %%mm2, %%mm0\n\t"
  1082. "psllq $8, %%mm4\n\t"
  1083. "psllq $16, %%mm5\n\t"
  1084. "por %%mm4, %%mm3\n\t"
  1085. "por %%mm5, %%mm3\n\t"
  1086. :"=m"(*d)
  1087. :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
  1088. :"memory");
  1089. /* Borrowed 32 to 24 */
  1090. __asm __volatile(
  1091. "movq %%mm0, %%mm4\n\t"
  1092. "movq %%mm3, %%mm5\n\t"
  1093. "movq %%mm6, %%mm0\n\t"
  1094. "movq %%mm7, %%mm1\n\t"
  1095. "movq %%mm4, %%mm6\n\t"
  1096. "movq %%mm5, %%mm7\n\t"
  1097. "movq %%mm0, %%mm2\n\t"
  1098. "movq %%mm1, %%mm3\n\t"
  1099. "psrlq $8, %%mm2\n\t"
  1100. "psrlq $8, %%mm3\n\t"
  1101. "psrlq $8, %%mm6\n\t"
  1102. "psrlq $8, %%mm7\n\t"
  1103. "pand %2, %%mm0\n\t"
  1104. "pand %2, %%mm1\n\t"
  1105. "pand %2, %%mm4\n\t"
  1106. "pand %2, %%mm5\n\t"
  1107. "pand %3, %%mm2\n\t"
  1108. "pand %3, %%mm3\n\t"
  1109. "pand %3, %%mm6\n\t"
  1110. "pand %3, %%mm7\n\t"
  1111. "por %%mm2, %%mm0\n\t"
  1112. "por %%mm3, %%mm1\n\t"
  1113. "por %%mm6, %%mm4\n\t"
  1114. "por %%mm7, %%mm5\n\t"
  1115. "movq %%mm1, %%mm2\n\t"
  1116. "movq %%mm4, %%mm3\n\t"
  1117. "psllq $48, %%mm2\n\t"
  1118. "psllq $32, %%mm3\n\t"
  1119. "pand %4, %%mm2\n\t"
  1120. "pand %5, %%mm3\n\t"
  1121. "por %%mm2, %%mm0\n\t"
  1122. "psrlq $16, %%mm1\n\t"
  1123. "psrlq $32, %%mm4\n\t"
  1124. "psllq $16, %%mm5\n\t"
  1125. "por %%mm3, %%mm1\n\t"
  1126. "pand %6, %%mm5\n\t"
  1127. "por %%mm5, %%mm4\n\t"
  1128. MOVNTQ" %%mm0, %0\n\t"
  1129. MOVNTQ" %%mm1, 8%0\n\t"
  1130. MOVNTQ" %%mm4, 16%0"
  1131. :"=m"(*d)
  1132. :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
  1133. :"memory");
  1134. d += 24;
  1135. s += 8;
  1136. }
  1137. __asm __volatile(SFENCE:::"memory");
  1138. __asm __volatile(EMMS:::"memory");
  1139. #endif
  1140. while(s < end)
  1141. {
  1142. register uint16_t bgr;
  1143. bgr = *s++;
  1144. *d++ = (bgr&0x1F)<<3;
  1145. *d++ = (bgr&0x7E0)>>3;
  1146. *d++ = (bgr&0xF800)>>8;
  1147. }
  1148. }
  1149. static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
  1150. {
  1151. const uint16_t *end;
  1152. #ifdef HAVE_MMX
  1153. const uint16_t *mm_end;
  1154. #endif
  1155. uint8_t *d = (uint8_t *)dst;
  1156. const uint16_t *s = (const uint16_t *)src;
  1157. end = s + src_size/2;
  1158. #ifdef HAVE_MMX
  1159. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  1160. __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
  1161. mm_end = end - 3;
  1162. while(s < mm_end)
  1163. {
  1164. __asm __volatile(
  1165. PREFETCH" 32%1\n\t"
  1166. "movq %1, %%mm0\n\t"
  1167. "movq %1, %%mm1\n\t"
  1168. "movq %1, %%mm2\n\t"
  1169. "pand %2, %%mm0\n\t"
  1170. "pand %3, %%mm1\n\t"
  1171. "pand %4, %%mm2\n\t"
  1172. "psllq $3, %%mm0\n\t"
  1173. "psrlq $2, %%mm1\n\t"
  1174. "psrlq $7, %%mm2\n\t"
  1175. "movq %%mm0, %%mm3\n\t"
  1176. "movq %%mm1, %%mm4\n\t"
  1177. "movq %%mm2, %%mm5\n\t"
  1178. "punpcklwd %%mm7, %%mm0\n\t"
  1179. "punpcklwd %%mm7, %%mm1\n\t"
  1180. "punpcklwd %%mm7, %%mm2\n\t"
  1181. "punpckhwd %%mm7, %%mm3\n\t"
  1182. "punpckhwd %%mm7, %%mm4\n\t"
  1183. "punpckhwd %%mm7, %%mm5\n\t"
  1184. "psllq $8, %%mm1\n\t"
  1185. "psllq $16, %%mm2\n\t"
  1186. "por %%mm1, %%mm0\n\t"
  1187. "por %%mm2, %%mm0\n\t"
  1188. "psllq $8, %%mm4\n\t"
  1189. "psllq $16, %%mm5\n\t"
  1190. "por %%mm4, %%mm3\n\t"
  1191. "por %%mm5, %%mm3\n\t"
  1192. MOVNTQ" %%mm0, %0\n\t"
  1193. MOVNTQ" %%mm3, 8%0\n\t"
  1194. :"=m"(*d)
  1195. :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
  1196. :"memory");
  1197. d += 16;
  1198. s += 4;
  1199. }
  1200. __asm __volatile(SFENCE:::"memory");
  1201. __asm __volatile(EMMS:::"memory");
  1202. #endif
  1203. while(s < end)
  1204. {
  1205. #if 0 //slightly slower on athlon
  1206. int bgr= *s++;
  1207. *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
  1208. #else
  1209. register uint16_t bgr;
  1210. bgr = *s++;
  1211. #ifdef WORDS_BIGENDIAN
  1212. *d++ = 0;
  1213. *d++ = (bgr&0x7C00)>>7;
  1214. *d++ = (bgr&0x3E0)>>2;
  1215. *d++ = (bgr&0x1F)<<3;
  1216. #else
  1217. *d++ = (bgr&0x1F)<<3;
  1218. *d++ = (bgr&0x3E0)>>2;
  1219. *d++ = (bgr&0x7C00)>>7;
  1220. *d++ = 0;
  1221. #endif
  1222. #endif
  1223. }
  1224. }
  1225. static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
  1226. {
  1227. const uint16_t *end;
  1228. #ifdef HAVE_MMX
  1229. const uint16_t *mm_end;
  1230. #endif
  1231. uint8_t *d = (uint8_t *)dst;
  1232. const uint16_t *s = (uint16_t *)src;
  1233. end = s + src_size/2;
  1234. #ifdef HAVE_MMX
  1235. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  1236. __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
  1237. mm_end = end - 3;
  1238. while(s < mm_end)
  1239. {
  1240. __asm __volatile(
  1241. PREFETCH" 32%1\n\t"
  1242. "movq %1, %%mm0\n\t"
  1243. "movq %1, %%mm1\n\t"
  1244. "movq %1, %%mm2\n\t"
  1245. "pand %2, %%mm0\n\t"
  1246. "pand %3, %%mm1\n\t"
  1247. "pand %4, %%mm2\n\t"
  1248. "psllq $3, %%mm0\n\t"
  1249. "psrlq $3, %%mm1\n\t"
  1250. "psrlq $8, %%mm2\n\t"
  1251. "movq %%mm0, %%mm3\n\t"
  1252. "movq %%mm1, %%mm4\n\t"
  1253. "movq %%mm2, %%mm5\n\t"
  1254. "punpcklwd %%mm7, %%mm0\n\t"
  1255. "punpcklwd %%mm7, %%mm1\n\t"
  1256. "punpcklwd %%mm7, %%mm2\n\t"
  1257. "punpckhwd %%mm7, %%mm3\n\t"
  1258. "punpckhwd %%mm7, %%mm4\n\t"
  1259. "punpckhwd %%mm7, %%mm5\n\t"
  1260. "psllq $8, %%mm1\n\t"
  1261. "psllq $16, %%mm2\n\t"
  1262. "por %%mm1, %%mm0\n\t"
  1263. "por %%mm2, %%mm0\n\t"
  1264. "psllq $8, %%mm4\n\t"
  1265. "psllq $16, %%mm5\n\t"
  1266. "por %%mm4, %%mm3\n\t"
  1267. "por %%mm5, %%mm3\n\t"
  1268. MOVNTQ" %%mm0, %0\n\t"
  1269. MOVNTQ" %%mm3, 8%0\n\t"
  1270. :"=m"(*d)
  1271. :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
  1272. :"memory");
  1273. d += 16;
  1274. s += 4;
  1275. }
  1276. __asm __volatile(SFENCE:::"memory");
  1277. __asm __volatile(EMMS:::"memory");
  1278. #endif
  1279. while(s < end)
  1280. {
  1281. register uint16_t bgr;
  1282. bgr = *s++;
  1283. #ifdef WORDS_BIGENDIAN
  1284. *d++ = 0;
  1285. *d++ = (bgr&0xF800)>>8;
  1286. *d++ = (bgr&0x7E0)>>3;
  1287. *d++ = (bgr&0x1F)<<3;
  1288. #else
  1289. *d++ = (bgr&0x1F)<<3;
  1290. *d++ = (bgr&0x7E0)>>3;
  1291. *d++ = (bgr&0xF800)>>8;
  1292. *d++ = 0;
  1293. #endif
  1294. }
  1295. }
  1296. static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
  1297. {
  1298. #ifdef HAVE_MMX
  1299. /* TODO: unroll this loop */
  1300. asm volatile (
  1301. "xor %%"REG_a", %%"REG_a" \n\t"
  1302. ".balign 16 \n\t"
  1303. "1: \n\t"
  1304. PREFETCH" 32(%0, %%"REG_a") \n\t"
  1305. "movq (%0, %%"REG_a"), %%mm0 \n\t"
  1306. "movq %%mm0, %%mm1 \n\t"
  1307. "movq %%mm0, %%mm2 \n\t"
  1308. "pslld $16, %%mm0 \n\t"
  1309. "psrld $16, %%mm1 \n\t"
  1310. "pand "MANGLE(mask32r)", %%mm0 \n\t"
  1311. "pand "MANGLE(mask32g)", %%mm2 \n\t"
  1312. "pand "MANGLE(mask32b)", %%mm1 \n\t"
  1313. "por %%mm0, %%mm2 \n\t"
  1314. "por %%mm1, %%mm2 \n\t"
  1315. MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
  1316. "add $8, %%"REG_a" \n\t"
  1317. "cmp %2, %%"REG_a" \n\t"
  1318. " jb 1b \n\t"
  1319. :: "r" (src), "r"(dst), "r" (src_size-7)
  1320. : "%"REG_a
  1321. );
  1322. __asm __volatile(SFENCE:::"memory");
  1323. __asm __volatile(EMMS:::"memory");
  1324. #else
  1325. unsigned i;
  1326. unsigned num_pixels = src_size >> 2;
  1327. for(i=0; i<num_pixels; i++)
  1328. {
  1329. #ifdef WORDS_BIGENDIAN
  1330. dst[4*i + 1] = src[4*i + 3];
  1331. dst[4*i + 2] = src[4*i + 2];
  1332. dst[4*i + 3] = src[4*i + 1];
  1333. #else
  1334. dst[4*i + 0] = src[4*i + 2];
  1335. dst[4*i + 1] = src[4*i + 1];
  1336. dst[4*i + 2] = src[4*i + 0];
  1337. #endif
  1338. }
  1339. #endif
  1340. }
  1341. static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
  1342. {
  1343. unsigned i;
  1344. #ifdef HAVE_MMX
  1345. long mmx_size= 23 - src_size;
  1346. asm volatile (
  1347. "movq "MANGLE(mask24r)", %%mm5 \n\t"
  1348. "movq "MANGLE(mask24g)", %%mm6 \n\t"
  1349. "movq "MANGLE(mask24b)", %%mm7 \n\t"
  1350. ".balign 16 \n\t"
  1351. "1: \n\t"
  1352. PREFETCH" 32(%1, %%"REG_a") \n\t"
  1353. "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
  1354. "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
  1355. "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
  1356. "psllq $16, %%mm0 \n\t" // 00 BGR BGR
  1357. "pand %%mm5, %%mm0 \n\t"
  1358. "pand %%mm6, %%mm1 \n\t"
  1359. "pand %%mm7, %%mm2 \n\t"
  1360. "por %%mm0, %%mm1 \n\t"
  1361. "por %%mm2, %%mm1 \n\t"
  1362. "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
  1363. MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
  1364. "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
  1365. "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
  1366. "pand %%mm7, %%mm0 \n\t"
  1367. "pand %%mm5, %%mm1 \n\t"
  1368. "pand %%mm6, %%mm2 \n\t"
  1369. "por %%mm0, %%mm1 \n\t"
  1370. "por %%mm2, %%mm1 \n\t"
  1371. "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
  1372. MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
  1373. "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
  1374. "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
  1375. "pand %%mm6, %%mm0 \n\t"
  1376. "pand %%mm7, %%mm1 \n\t"
  1377. "pand %%mm5, %%mm2 \n\t"
  1378. "por %%mm0, %%mm1 \n\t"
  1379. "por %%mm2, %%mm1 \n\t"
  1380. MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
  1381. "add $24, %%"REG_a" \n\t"
  1382. " js 1b \n\t"
  1383. : "+a" (mmx_size)
  1384. : "r" (src-mmx_size), "r"(dst-mmx_size)
  1385. );
  1386. __asm __volatile(SFENCE:::"memory");
  1387. __asm __volatile(EMMS:::"memory");
  1388. if(mmx_size==23) return; //finihsed, was multiple of 8
  1389. src+= src_size;
  1390. dst+= src_size;
  1391. src_size= 23-mmx_size;
  1392. src-= src_size;
  1393. dst-= src_size;
  1394. #endif
  1395. for(i=0; i<src_size; i+=3)
  1396. {
  1397. register uint8_t x;
  1398. x = src[i + 2];
  1399. dst[i + 1] = src[i + 1];
  1400. dst[i + 2] = src[i + 0];
  1401. dst[i + 0] = x;
  1402. }
  1403. }
  1404. static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1405. long width, long height,
  1406. long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
  1407. {
  1408. long y;
  1409. const long chromWidth= width>>1;
  1410. for(y=0; y<height; y++)
  1411. {
  1412. #ifdef HAVE_MMX
  1413. //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
  1414. asm volatile(
  1415. "xor %%"REG_a", %%"REG_a" \n\t"
  1416. ".balign 16 \n\t"
  1417. "1: \n\t"
  1418. PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
  1419. PREFETCH" 32(%2, %%"REG_a") \n\t"
  1420. PREFETCH" 32(%3, %%"REG_a") \n\t"
  1421. "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
  1422. "movq %%mm0, %%mm2 \n\t" // U(0)
  1423. "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
  1424. "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1425. "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
  1426. "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
  1427. "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
  1428. "movq %%mm3, %%mm4 \n\t" // Y(0)
  1429. "movq %%mm5, %%mm6 \n\t" // Y(8)
  1430. "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
  1431. "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
  1432. "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
  1433. "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
  1434. MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
  1435. MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
  1436. MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
  1437. MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
  1438. "add $8, %%"REG_a" \n\t"
  1439. "cmp %4, %%"REG_a" \n\t"
  1440. " jb 1b \n\t"
  1441. ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
  1442. : "%"REG_a
  1443. );
  1444. #else
  1445. #if defined ARCH_ALPHA && defined HAVE_MVI
  1446. #define pl2yuy2(n) \
  1447. y1 = yc[n]; \
  1448. y2 = yc2[n]; \
  1449. u = uc[n]; \
  1450. v = vc[n]; \
  1451. asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
  1452. asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
  1453. asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
  1454. asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
  1455. yuv1 = (u << 8) + (v << 24); \
  1456. yuv2 = yuv1 + y2; \
  1457. yuv1 += y1; \
  1458. qdst[n] = yuv1; \
  1459. qdst2[n] = yuv2;
  1460. int i;
  1461. uint64_t *qdst = (uint64_t *) dst;
  1462. uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
  1463. const uint32_t *yc = (uint32_t *) ysrc;
  1464. const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
  1465. const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
  1466. for(i = 0; i < chromWidth; i += 8){
  1467. uint64_t y1, y2, yuv1, yuv2;
  1468. uint64_t u, v;
  1469. /* Prefetch */
  1470. asm("ldq $31,64(%0)" :: "r"(yc));
  1471. asm("ldq $31,64(%0)" :: "r"(yc2));
  1472. asm("ldq $31,64(%0)" :: "r"(uc));
  1473. asm("ldq $31,64(%0)" :: "r"(vc));
  1474. pl2yuy2(0);
  1475. pl2yuy2(1);
  1476. pl2yuy2(2);
  1477. pl2yuy2(3);
  1478. yc += 4;
  1479. yc2 += 4;
  1480. uc += 4;
  1481. vc += 4;
  1482. qdst += 4;
  1483. qdst2 += 4;
  1484. }
  1485. y++;
  1486. ysrc += lumStride;
  1487. dst += dstStride;
  1488. #elif __WORDSIZE >= 64
  1489. int i;
  1490. uint64_t *ldst = (uint64_t *) dst;
  1491. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1492. for(i = 0; i < chromWidth; i += 2){
  1493. uint64_t k, l;
  1494. k = yc[0] + (uc[0] << 8) +
  1495. (yc[1] << 16) + (vc[0] << 24);
  1496. l = yc[2] + (uc[1] << 8) +
  1497. (yc[3] << 16) + (vc[1] << 24);
  1498. *ldst++ = k + (l << 32);
  1499. yc += 4;
  1500. uc += 2;
  1501. vc += 2;
  1502. }
  1503. #else
  1504. int i, *idst = (int32_t *) dst;
  1505. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1506. for(i = 0; i < chromWidth; i++){
  1507. #ifdef WORDS_BIGENDIAN
  1508. *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
  1509. (yc[1] << 8) + (vc[0] << 0);
  1510. #else
  1511. *idst++ = yc[0] + (uc[0] << 8) +
  1512. (yc[1] << 16) + (vc[0] << 24);
  1513. #endif
  1514. yc += 2;
  1515. uc++;
  1516. vc++;
  1517. }
  1518. #endif
  1519. #endif
  1520. if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
  1521. {
  1522. usrc += chromStride;
  1523. vsrc += chromStride;
  1524. }
  1525. ysrc += lumStride;
  1526. dst += dstStride;
  1527. }
  1528. #ifdef HAVE_MMX
  1529. asm( EMMS" \n\t"
  1530. SFENCE" \n\t"
  1531. :::"memory");
  1532. #endif
  1533. }
  1534. /**
  1535. *
  1536. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1537. * problem for anyone then tell me, and ill fix it)
  1538. */
  1539. static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1540. long width, long height,
  1541. long lumStride, long chromStride, long dstStride)
  1542. {
  1543. //FIXME interpolate chroma
  1544. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1545. }
  1546. static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1547. long width, long height,
  1548. long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
  1549. {
  1550. long y;
  1551. const long chromWidth= width>>1;
  1552. for(y=0; y<height; y++)
  1553. {
  1554. #ifdef HAVE_MMX
  1555. //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
  1556. asm volatile(
  1557. "xor %%"REG_a", %%"REG_a" \n\t"
  1558. ".balign 16 \n\t"
  1559. "1: \n\t"
  1560. PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
  1561. PREFETCH" 32(%2, %%"REG_a") \n\t"
  1562. PREFETCH" 32(%3, %%"REG_a") \n\t"
  1563. "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
  1564. "movq %%mm0, %%mm2 \n\t" // U(0)
  1565. "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
  1566. "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1567. "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
  1568. "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
  1569. "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
  1570. "movq %%mm0, %%mm4 \n\t" // Y(0)
  1571. "movq %%mm2, %%mm6 \n\t" // Y(8)
  1572. "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
  1573. "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
  1574. "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
  1575. "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
  1576. MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
  1577. MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
  1578. MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
  1579. MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
  1580. "add $8, %%"REG_a" \n\t"
  1581. "cmp %4, %%"REG_a" \n\t"
  1582. " jb 1b \n\t"
  1583. ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
  1584. : "%"REG_a
  1585. );
  1586. #else
  1587. //FIXME adapt the alpha asm code from yv12->yuy2
  1588. #if __WORDSIZE >= 64
  1589. int i;
  1590. uint64_t *ldst = (uint64_t *) dst;
  1591. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1592. for(i = 0; i < chromWidth; i += 2){
  1593. uint64_t k, l;
  1594. k = uc[0] + (yc[0] << 8) +
  1595. (vc[0] << 16) + (yc[1] << 24);
  1596. l = uc[1] + (yc[2] << 8) +
  1597. (vc[1] << 16) + (yc[3] << 24);
  1598. *ldst++ = k + (l << 32);
  1599. yc += 4;
  1600. uc += 2;
  1601. vc += 2;
  1602. }
  1603. #else
  1604. int i, *idst = (int32_t *) dst;
  1605. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1606. for(i = 0; i < chromWidth; i++){
  1607. #ifdef WORDS_BIGENDIAN
  1608. *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
  1609. (vc[0] << 8) + (yc[1] << 0);
  1610. #else
  1611. *idst++ = uc[0] + (yc[0] << 8) +
  1612. (vc[0] << 16) + (yc[1] << 24);
  1613. #endif
  1614. yc += 2;
  1615. uc++;
  1616. vc++;
  1617. }
  1618. #endif
  1619. #endif
  1620. if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
  1621. {
  1622. usrc += chromStride;
  1623. vsrc += chromStride;
  1624. }
  1625. ysrc += lumStride;
  1626. dst += dstStride;
  1627. }
  1628. #ifdef HAVE_MMX
  1629. asm( EMMS" \n\t"
  1630. SFENCE" \n\t"
  1631. :::"memory");
  1632. #endif
  1633. }
  1634. /**
  1635. *
  1636. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1637. * problem for anyone then tell me, and ill fix it)
  1638. */
  1639. static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1640. long width, long height,
  1641. long lumStride, long chromStride, long dstStride)
  1642. {
  1643. //FIXME interpolate chroma
  1644. RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1645. }
  1646. /**
  1647. *
  1648. * width should be a multiple of 16
  1649. */
  1650. static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1651. long width, long height,
  1652. long lumStride, long chromStride, long dstStride)
  1653. {
  1654. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1655. }
  1656. /**
  1657. *
  1658. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1659. * problem for anyone then tell me, and ill fix it)
  1660. */
  1661. static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1662. long width, long height,
  1663. long lumStride, long chromStride, long srcStride)
  1664. {
  1665. long y;
  1666. const long chromWidth= width>>1;
  1667. for(y=0; y<height; y+=2)
  1668. {
  1669. #ifdef HAVE_MMX
  1670. asm volatile(
  1671. "xor %%"REG_a", %%"REG_a" \n\t"
  1672. "pcmpeqw %%mm7, %%mm7 \n\t"
  1673. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1674. ".balign 16 \n\t"
  1675. "1: \n\t"
  1676. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1677. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1678. "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
  1679. "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
  1680. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
  1681. "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
  1682. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
  1683. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1684. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1685. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1686. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1687. MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
  1688. "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
  1689. "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
  1690. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
  1691. "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
  1692. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
  1693. "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
  1694. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1695. "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1696. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1697. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1698. MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
  1699. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1700. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1701. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1702. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1703. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1704. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1705. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1706. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1707. MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
  1708. MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
  1709. "add $8, %%"REG_a" \n\t"
  1710. "cmp %4, %%"REG_a" \n\t"
  1711. " jb 1b \n\t"
  1712. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1713. : "memory", "%"REG_a
  1714. );
  1715. ydst += lumStride;
  1716. src += srcStride;
  1717. asm volatile(
  1718. "xor %%"REG_a", %%"REG_a" \n\t"
  1719. ".balign 16 \n\t"
  1720. "1: \n\t"
  1721. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1722. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1723. "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
  1724. "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
  1725. "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
  1726. "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1727. "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1728. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1729. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1730. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1731. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1732. MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
  1733. MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
  1734. "add $8, %%"REG_a" \n\t"
  1735. "cmp %4, %%"REG_a" \n\t"
  1736. " jb 1b \n\t"
  1737. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1738. : "memory", "%"REG_a
  1739. );
  1740. #else
  1741. long i;
  1742. for(i=0; i<chromWidth; i++)
  1743. {
  1744. ydst[2*i+0] = src[4*i+0];
  1745. udst[i] = src[4*i+1];
  1746. ydst[2*i+1] = src[4*i+2];
  1747. vdst[i] = src[4*i+3];
  1748. }
  1749. ydst += lumStride;
  1750. src += srcStride;
  1751. for(i=0; i<chromWidth; i++)
  1752. {
  1753. ydst[2*i+0] = src[4*i+0];
  1754. ydst[2*i+1] = src[4*i+2];
  1755. }
  1756. #endif
  1757. udst += chromStride;
  1758. vdst += chromStride;
  1759. ydst += lumStride;
  1760. src += srcStride;
  1761. }
  1762. #ifdef HAVE_MMX
  1763. asm volatile( EMMS" \n\t"
  1764. SFENCE" \n\t"
  1765. :::"memory");
  1766. #endif
  1767. }
  1768. static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
  1769. uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1770. long width, long height, long lumStride, long chromStride)
  1771. {
  1772. /* Y Plane */
  1773. memcpy(ydst, ysrc, width*height);
  1774. /* XXX: implement upscaling for U,V */
  1775. }
  1776. static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
  1777. {
  1778. long x,y;
  1779. dst[0]= src[0];
  1780. // first line
  1781. for(x=0; x<srcWidth-1; x++){
  1782. dst[2*x+1]= (3*src[x] + src[x+1])>>2;
  1783. dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
  1784. }
  1785. dst[2*srcWidth-1]= src[srcWidth-1];
  1786. dst+= dstStride;
  1787. for(y=1; y<srcHeight; y++){
  1788. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  1789. const long mmxSize= srcWidth&~15;
  1790. asm volatile(
  1791. "mov %4, %%"REG_a" \n\t"
  1792. "1: \n\t"
  1793. "movq (%0, %%"REG_a"), %%mm0 \n\t"
  1794. "movq (%1, %%"REG_a"), %%mm1 \n\t"
  1795. "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
  1796. "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
  1797. "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
  1798. "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
  1799. PAVGB" %%mm0, %%mm5 \n\t"
  1800. PAVGB" %%mm0, %%mm3 \n\t"
  1801. PAVGB" %%mm0, %%mm5 \n\t"
  1802. PAVGB" %%mm0, %%mm3 \n\t"
  1803. PAVGB" %%mm1, %%mm4 \n\t"
  1804. PAVGB" %%mm1, %%mm2 \n\t"
  1805. PAVGB" %%mm1, %%mm4 \n\t"
  1806. PAVGB" %%mm1, %%mm2 \n\t"
  1807. "movq %%mm5, %%mm7 \n\t"
  1808. "movq %%mm4, %%mm6 \n\t"
  1809. "punpcklbw %%mm3, %%mm5 \n\t"
  1810. "punpckhbw %%mm3, %%mm7 \n\t"
  1811. "punpcklbw %%mm2, %%mm4 \n\t"
  1812. "punpckhbw %%mm2, %%mm6 \n\t"
  1813. #if 1
  1814. MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
  1815. MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
  1816. MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
  1817. MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
  1818. #else
  1819. "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
  1820. "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
  1821. "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
  1822. "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
  1823. #endif
  1824. "add $8, %%"REG_a" \n\t"
  1825. " js 1b \n\t"
  1826. :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
  1827. "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
  1828. "g" (-mmxSize)
  1829. : "%"REG_a
  1830. );
  1831. #else
  1832. const long mmxSize=1;
  1833. #endif
  1834. dst[0 ]= (3*src[0] + src[srcStride])>>2;
  1835. dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
  1836. for(x=mmxSize-1; x<srcWidth-1; x++){
  1837. dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
  1838. dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
  1839. dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
  1840. dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
  1841. }
  1842. dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
  1843. dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
  1844. dst+=dstStride*2;
  1845. src+=srcStride;
  1846. }
  1847. // last line
  1848. #if 1
  1849. dst[0]= src[0];
  1850. for(x=0; x<srcWidth-1; x++){
  1851. dst[2*x+1]= (3*src[x] + src[x+1])>>2;
  1852. dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
  1853. }
  1854. dst[2*srcWidth-1]= src[srcWidth-1];
  1855. #else
  1856. for(x=0; x<srcWidth; x++){
  1857. dst[2*x+0]=
  1858. dst[2*x+1]= src[x];
  1859. }
  1860. #endif
  1861. #ifdef HAVE_MMX
  1862. asm volatile( EMMS" \n\t"
  1863. SFENCE" \n\t"
  1864. :::"memory");
  1865. #endif
  1866. }
  1867. /**
  1868. *
  1869. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1870. * problem for anyone then tell me, and ill fix it)
  1871. * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
  1872. */
  1873. static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1874. long width, long height,
  1875. long lumStride, long chromStride, long srcStride)
  1876. {
  1877. long y;
  1878. const long chromWidth= width>>1;
  1879. for(y=0; y<height; y+=2)
  1880. {
  1881. #ifdef HAVE_MMX
  1882. asm volatile(
  1883. "xorl %%eax, %%eax \n\t"
  1884. "pcmpeqw %%mm7, %%mm7 \n\t"
  1885. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1886. ".balign 16 \n\t"
  1887. "1: \n\t"
  1888. PREFETCH" 64(%0, %%eax, 4) \n\t"
  1889. "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
  1890. "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
  1891. "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
  1892. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
  1893. "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
  1894. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
  1895. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1896. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1897. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1898. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1899. MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
  1900. "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
  1901. "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
  1902. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
  1903. "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
  1904. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
  1905. "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
  1906. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1907. "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1908. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1909. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1910. MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
  1911. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1912. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1913. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1914. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1915. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1916. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1917. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1918. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1919. MOVNTQ" %%mm0, (%3, %%eax) \n\t"
  1920. MOVNTQ" %%mm2, (%2, %%eax) \n\t"
  1921. "addl $8, %%eax \n\t"
  1922. "cmpl %4, %%eax \n\t"
  1923. " jb 1b \n\t"
  1924. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1925. : "memory", "%eax"
  1926. );
  1927. ydst += lumStride;
  1928. src += srcStride;
  1929. asm volatile(
  1930. "xorl %%eax, %%eax \n\t"
  1931. ".balign 16 \n\t"
  1932. "1: \n\t"
  1933. PREFETCH" 64(%0, %%eax, 4) \n\t"
  1934. "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
  1935. "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
  1936. "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
  1937. "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
  1938. "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1939. "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1940. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1941. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1942. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1943. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1944. MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
  1945. MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
  1946. "addl $8, %%eax \n\t"
  1947. "cmpl %4, %%eax \n\t"
  1948. " jb 1b \n\t"
  1949. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1950. : "memory", "%eax"
  1951. );
  1952. #else
  1953. long i;
  1954. for(i=0; i<chromWidth; i++)
  1955. {
  1956. udst[i] = src[4*i+0];
  1957. ydst[2*i+0] = src[4*i+1];
  1958. vdst[i] = src[4*i+2];
  1959. ydst[2*i+1] = src[4*i+3];
  1960. }
  1961. ydst += lumStride;
  1962. src += srcStride;
  1963. for(i=0; i<chromWidth; i++)
  1964. {
  1965. ydst[2*i+0] = src[4*i+1];
  1966. ydst[2*i+1] = src[4*i+3];
  1967. }
  1968. #endif
  1969. udst += chromStride;
  1970. vdst += chromStride;
  1971. ydst += lumStride;
  1972. src += srcStride;
  1973. }
  1974. #ifdef HAVE_MMX
  1975. asm volatile( EMMS" \n\t"
  1976. SFENCE" \n\t"
  1977. :::"memory");
  1978. #endif
  1979. }
  1980. /**
  1981. *
  1982. * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
  1983. * problem for anyone then tell me, and ill fix it)
  1984. * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
  1985. */
  1986. static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1987. long width, long height,
  1988. long lumStride, long chromStride, long srcStride)
  1989. {
  1990. long y;
  1991. const long chromWidth= width>>1;
  1992. #ifdef HAVE_MMX
  1993. for(y=0; y<height-2; y+=2)
  1994. {
  1995. long i;
  1996. for(i=0; i<2; i++)
  1997. {
  1998. asm volatile(
  1999. "mov %2, %%"REG_a" \n\t"
  2000. "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
  2001. "movq "MANGLE(w1111)", %%mm5 \n\t"
  2002. "pxor %%mm7, %%mm7 \n\t"
  2003. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
  2004. ".balign 16 \n\t"
  2005. "1: \n\t"
  2006. PREFETCH" 64(%0, %%"REG_b") \n\t"
  2007. "movd (%0, %%"REG_b"), %%mm0 \n\t"
  2008. "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
  2009. "punpcklbw %%mm7, %%mm0 \n\t"
  2010. "punpcklbw %%mm7, %%mm1 \n\t"
  2011. "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
  2012. "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
  2013. "punpcklbw %%mm7, %%mm2 \n\t"
  2014. "punpcklbw %%mm7, %%mm3 \n\t"
  2015. "pmaddwd %%mm6, %%mm0 \n\t"
  2016. "pmaddwd %%mm6, %%mm1 \n\t"
  2017. "pmaddwd %%mm6, %%mm2 \n\t"
  2018. "pmaddwd %%mm6, %%mm3 \n\t"
  2019. #ifndef FAST_BGR2YV12
  2020. "psrad $8, %%mm0 \n\t"
  2021. "psrad $8, %%mm1 \n\t"
  2022. "psrad $8, %%mm2 \n\t"
  2023. "psrad $8, %%mm3 \n\t"
  2024. #endif
  2025. "packssdw %%mm1, %%mm0 \n\t"
  2026. "packssdw %%mm3, %%mm2 \n\t"
  2027. "pmaddwd %%mm5, %%mm0 \n\t"
  2028. "pmaddwd %%mm5, %%mm2 \n\t"
  2029. "packssdw %%mm2, %%mm0 \n\t"
  2030. "psraw $7, %%mm0 \n\t"
  2031. "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
  2032. "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
  2033. "punpcklbw %%mm7, %%mm4 \n\t"
  2034. "punpcklbw %%mm7, %%mm1 \n\t"
  2035. "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
  2036. "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
  2037. "punpcklbw %%mm7, %%mm2 \n\t"
  2038. "punpcklbw %%mm7, %%mm3 \n\t"
  2039. "pmaddwd %%mm6, %%mm4 \n\t"
  2040. "pmaddwd %%mm6, %%mm1 \n\t"
  2041. "pmaddwd %%mm6, %%mm2 \n\t"
  2042. "pmaddwd %%mm6, %%mm3 \n\t"
  2043. #ifndef FAST_BGR2YV12
  2044. "psrad $8, %%mm4 \n\t"
  2045. "psrad $8, %%mm1 \n\t"
  2046. "psrad $8, %%mm2 \n\t"
  2047. "psrad $8, %%mm3 \n\t"
  2048. #endif
  2049. "packssdw %%mm1, %%mm4 \n\t"
  2050. "packssdw %%mm3, %%mm2 \n\t"
  2051. "pmaddwd %%mm5, %%mm4 \n\t"
  2052. "pmaddwd %%mm5, %%mm2 \n\t"
  2053. "add $24, %%"REG_b" \n\t"
  2054. "packssdw %%mm2, %%mm4 \n\t"
  2055. "psraw $7, %%mm4 \n\t"
  2056. "packuswb %%mm4, %%mm0 \n\t"
  2057. "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
  2058. MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
  2059. "add $8, %%"REG_a" \n\t"
  2060. " js 1b \n\t"
  2061. : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
  2062. : "%"REG_a, "%"REG_b
  2063. );
  2064. ydst += lumStride;
  2065. src += srcStride;
  2066. }
  2067. src -= srcStride*2;
  2068. asm volatile(
  2069. "mov %4, %%"REG_a" \n\t"
  2070. "movq "MANGLE(w1111)", %%mm5 \n\t"
  2071. "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
  2072. "pxor %%mm7, %%mm7 \n\t"
  2073. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
  2074. "add %%"REG_b", %%"REG_b" \n\t"
  2075. ".balign 16 \n\t"
  2076. "1: \n\t"
  2077. PREFETCH" 64(%0, %%"REG_b") \n\t"
  2078. PREFETCH" 64(%1, %%"REG_b") \n\t"
  2079. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  2080. "movq (%0, %%"REG_b"), %%mm0 \n\t"
  2081. "movq (%1, %%"REG_b"), %%mm1 \n\t"
  2082. "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
  2083. "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
  2084. PAVGB" %%mm1, %%mm0 \n\t"
  2085. PAVGB" %%mm3, %%mm2 \n\t"
  2086. "movq %%mm0, %%mm1 \n\t"
  2087. "movq %%mm2, %%mm3 \n\t"
  2088. "psrlq $24, %%mm0 \n\t"
  2089. "psrlq $24, %%mm2 \n\t"
  2090. PAVGB" %%mm1, %%mm0 \n\t"
  2091. PAVGB" %%mm3, %%mm2 \n\t"
  2092. "punpcklbw %%mm7, %%mm0 \n\t"
  2093. "punpcklbw %%mm7, %%mm2 \n\t"
  2094. #else
  2095. "movd (%0, %%"REG_b"), %%mm0 \n\t"
  2096. "movd (%1, %%"REG_b"), %%mm1 \n\t"
  2097. "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
  2098. "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
  2099. "punpcklbw %%mm7, %%mm0 \n\t"
  2100. "punpcklbw %%mm7, %%mm1 \n\t"
  2101. "punpcklbw %%mm7, %%mm2 \n\t"
  2102. "punpcklbw %%mm7, %%mm3 \n\t"
  2103. "paddw %%mm1, %%mm0 \n\t"
  2104. "paddw %%mm3, %%mm2 \n\t"
  2105. "paddw %%mm2, %%mm0 \n\t"
  2106. "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
  2107. "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
  2108. "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
  2109. "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
  2110. "punpcklbw %%mm7, %%mm4 \n\t"
  2111. "punpcklbw %%mm7, %%mm1 \n\t"
  2112. "punpcklbw %%mm7, %%mm2 \n\t"
  2113. "punpcklbw %%mm7, %%mm3 \n\t"
  2114. "paddw %%mm1, %%mm4 \n\t"
  2115. "paddw %%mm3, %%mm2 \n\t"
  2116. "paddw %%mm4, %%mm2 \n\t"
  2117. "psrlw $2, %%mm0 \n\t"
  2118. "psrlw $2, %%mm2 \n\t"
  2119. #endif
  2120. "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
  2121. "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
  2122. "pmaddwd %%mm0, %%mm1 \n\t"
  2123. "pmaddwd %%mm2, %%mm3 \n\t"
  2124. "pmaddwd %%mm6, %%mm0 \n\t"
  2125. "pmaddwd %%mm6, %%mm2 \n\t"
  2126. #ifndef FAST_BGR2YV12
  2127. "psrad $8, %%mm0 \n\t"
  2128. "psrad $8, %%mm1 \n\t"
  2129. "psrad $8, %%mm2 \n\t"
  2130. "psrad $8, %%mm3 \n\t"
  2131. #endif
  2132. "packssdw %%mm2, %%mm0 \n\t"
  2133. "packssdw %%mm3, %%mm1 \n\t"
  2134. "pmaddwd %%mm5, %%mm0 \n\t"
  2135. "pmaddwd %%mm5, %%mm1 \n\t"
  2136. "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
  2137. "psraw $7, %%mm0 \n\t"
  2138. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  2139. "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
  2140. "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
  2141. "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
  2142. "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
  2143. PAVGB" %%mm1, %%mm4 \n\t"
  2144. PAVGB" %%mm3, %%mm2 \n\t"
  2145. "movq %%mm4, %%mm1 \n\t"
  2146. "movq %%mm2, %%mm3 \n\t"
  2147. "psrlq $24, %%mm4 \n\t"
  2148. "psrlq $24, %%mm2 \n\t"
  2149. PAVGB" %%mm1, %%mm4 \n\t"
  2150. PAVGB" %%mm3, %%mm2 \n\t"
  2151. "punpcklbw %%mm7, %%mm4 \n\t"
  2152. "punpcklbw %%mm7, %%mm2 \n\t"
  2153. #else
  2154. "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
  2155. "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
  2156. "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
  2157. "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
  2158. "punpcklbw %%mm7, %%mm4 \n\t"
  2159. "punpcklbw %%mm7, %%mm1 \n\t"
  2160. "punpcklbw %%mm7, %%mm2 \n\t"
  2161. "punpcklbw %%mm7, %%mm3 \n\t"
  2162. "paddw %%mm1, %%mm4 \n\t"
  2163. "paddw %%mm3, %%mm2 \n\t"
  2164. "paddw %%mm2, %%mm4 \n\t"
  2165. "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
  2166. "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
  2167. "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
  2168. "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
  2169. "punpcklbw %%mm7, %%mm5 \n\t"
  2170. "punpcklbw %%mm7, %%mm1 \n\t"
  2171. "punpcklbw %%mm7, %%mm2 \n\t"
  2172. "punpcklbw %%mm7, %%mm3 \n\t"
  2173. "paddw %%mm1, %%mm5 \n\t"
  2174. "paddw %%mm3, %%mm2 \n\t"
  2175. "paddw %%mm5, %%mm2 \n\t"
  2176. "movq "MANGLE(w1111)", %%mm5 \n\t"
  2177. "psrlw $2, %%mm4 \n\t"
  2178. "psrlw $2, %%mm2 \n\t"
  2179. #endif
  2180. "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
  2181. "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
  2182. "pmaddwd %%mm4, %%mm1 \n\t"
  2183. "pmaddwd %%mm2, %%mm3 \n\t"
  2184. "pmaddwd %%mm6, %%mm4 \n\t"
  2185. "pmaddwd %%mm6, %%mm2 \n\t"
  2186. #ifndef FAST_BGR2YV12
  2187. "psrad $8, %%mm4 \n\t"
  2188. "psrad $8, %%mm1 \n\t"
  2189. "psrad $8, %%mm2 \n\t"
  2190. "psrad $8, %%mm3 \n\t"
  2191. #endif
  2192. "packssdw %%mm2, %%mm4 \n\t"
  2193. "packssdw %%mm3, %%mm1 \n\t"
  2194. "pmaddwd %%mm5, %%mm4 \n\t"
  2195. "pmaddwd %%mm5, %%mm1 \n\t"
  2196. "add $24, %%"REG_b" \n\t"
  2197. "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
  2198. "psraw $7, %%mm4 \n\t"
  2199. "movq %%mm0, %%mm1 \n\t"
  2200. "punpckldq %%mm4, %%mm0 \n\t"
  2201. "punpckhdq %%mm4, %%mm1 \n\t"
  2202. "packsswb %%mm1, %%mm0 \n\t"
  2203. "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
  2204. "movd %%mm0, (%2, %%"REG_a") \n\t"
  2205. "punpckhdq %%mm0, %%mm0 \n\t"
  2206. "movd %%mm0, (%3, %%"REG_a") \n\t"
  2207. "add $4, %%"REG_a" \n\t"
  2208. " js 1b \n\t"
  2209. : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
  2210. : "%"REG_a, "%"REG_b
  2211. );
  2212. udst += chromStride;
  2213. vdst += chromStride;
  2214. src += srcStride*2;
  2215. }
  2216. asm volatile( EMMS" \n\t"
  2217. SFENCE" \n\t"
  2218. :::"memory");
  2219. #else
  2220. y=0;
  2221. #endif
  2222. for(; y<height; y+=2)
  2223. {
  2224. long i;
  2225. for(i=0; i<chromWidth; i++)
  2226. {
  2227. unsigned int b= src[6*i+0];
  2228. unsigned int g= src[6*i+1];
  2229. unsigned int r= src[6*i+2];
  2230. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2231. unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
  2232. unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
  2233. udst[i] = U;
  2234. vdst[i] = V;
  2235. ydst[2*i] = Y;
  2236. b= src[6*i+3];
  2237. g= src[6*i+4];
  2238. r= src[6*i+5];
  2239. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2240. ydst[2*i+1] = Y;
  2241. }
  2242. ydst += lumStride;
  2243. src += srcStride;
  2244. for(i=0; i<chromWidth; i++)
  2245. {
  2246. unsigned int b= src[6*i+0];
  2247. unsigned int g= src[6*i+1];
  2248. unsigned int r= src[6*i+2];
  2249. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2250. ydst[2*i] = Y;
  2251. b= src[6*i+3];
  2252. g= src[6*i+4];
  2253. r= src[6*i+5];
  2254. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2255. ydst[2*i+1] = Y;
  2256. }
  2257. udst += chromStride;
  2258. vdst += chromStride;
  2259. ydst += lumStride;
  2260. src += srcStride;
  2261. }
  2262. }
  2263. void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
  2264. long width, long height, long src1Stride,
  2265. long src2Stride, long dstStride){
  2266. long h;
  2267. for(h=0; h < height; h++)
  2268. {
  2269. long w;
  2270. #ifdef HAVE_MMX
  2271. #ifdef HAVE_SSE2
  2272. asm(
  2273. "xor %%"REG_a", %%"REG_a" \n\t"
  2274. "1: \n\t"
  2275. PREFETCH" 64(%1, %%"REG_a") \n\t"
  2276. PREFETCH" 64(%2, %%"REG_a") \n\t"
  2277. "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
  2278. "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
  2279. "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
  2280. "punpcklbw %%xmm2, %%xmm0 \n\t"
  2281. "punpckhbw %%xmm2, %%xmm1 \n\t"
  2282. "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
  2283. "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
  2284. "add $16, %%"REG_a" \n\t"
  2285. "cmp %3, %%"REG_a" \n\t"
  2286. " jb 1b \n\t"
  2287. ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
  2288. : "memory", "%"REG_a""
  2289. );
  2290. #else
  2291. asm(
  2292. "xor %%"REG_a", %%"REG_a" \n\t"
  2293. "1: \n\t"
  2294. PREFETCH" 64(%1, %%"REG_a") \n\t"
  2295. PREFETCH" 64(%2, %%"REG_a") \n\t"
  2296. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  2297. "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
  2298. "movq %%mm0, %%mm1 \n\t"
  2299. "movq %%mm2, %%mm3 \n\t"
  2300. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  2301. "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
  2302. "punpcklbw %%mm4, %%mm0 \n\t"
  2303. "punpckhbw %%mm4, %%mm1 \n\t"
  2304. "punpcklbw %%mm5, %%mm2 \n\t"
  2305. "punpckhbw %%mm5, %%mm3 \n\t"
  2306. MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
  2307. MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
  2308. MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
  2309. MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
  2310. "add $16, %%"REG_a" \n\t"
  2311. "cmp %3, %%"REG_a" \n\t"
  2312. " jb 1b \n\t"
  2313. ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
  2314. : "memory", "%"REG_a
  2315. );
  2316. #endif
  2317. for(w= (width&(~15)); w < width; w++)
  2318. {
  2319. dest[2*w+0] = src1[w];
  2320. dest[2*w+1] = src2[w];
  2321. }
  2322. #else
  2323. for(w=0; w < width; w++)
  2324. {
  2325. dest[2*w+0] = src1[w];
  2326. dest[2*w+1] = src2[w];
  2327. }
  2328. #endif
  2329. dest += dstStride;
  2330. src1 += src1Stride;
  2331. src2 += src2Stride;
  2332. }
  2333. #ifdef HAVE_MMX
  2334. asm(
  2335. EMMS" \n\t"
  2336. SFENCE" \n\t"
  2337. ::: "memory"
  2338. );
  2339. #endif
  2340. }
  2341. static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
  2342. uint8_t *dst1, uint8_t *dst2,
  2343. long width, long height,
  2344. long srcStride1, long srcStride2,
  2345. long dstStride1, long dstStride2)
  2346. {
  2347. long y,x,w,h;
  2348. w=width/2; h=height/2;
  2349. #ifdef HAVE_MMX
  2350. asm volatile(
  2351. PREFETCH" %0\n\t"
  2352. PREFETCH" %1\n\t"
  2353. ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
  2354. #endif
  2355. for(y=0;y<h;y++){
  2356. const uint8_t* s1=src1+srcStride1*(y>>1);
  2357. uint8_t* d=dst1+dstStride1*y;
  2358. x=0;
  2359. #ifdef HAVE_MMX
  2360. for(;x<w-31;x+=32)
  2361. {
  2362. asm volatile(
  2363. PREFETCH" 32%1\n\t"
  2364. "movq %1, %%mm0\n\t"
  2365. "movq 8%1, %%mm2\n\t"
  2366. "movq 16%1, %%mm4\n\t"
  2367. "movq 24%1, %%mm6\n\t"
  2368. "movq %%mm0, %%mm1\n\t"
  2369. "movq %%mm2, %%mm3\n\t"
  2370. "movq %%mm4, %%mm5\n\t"
  2371. "movq %%mm6, %%mm7\n\t"
  2372. "punpcklbw %%mm0, %%mm0\n\t"
  2373. "punpckhbw %%mm1, %%mm1\n\t"
  2374. "punpcklbw %%mm2, %%mm2\n\t"
  2375. "punpckhbw %%mm3, %%mm3\n\t"
  2376. "punpcklbw %%mm4, %%mm4\n\t"
  2377. "punpckhbw %%mm5, %%mm5\n\t"
  2378. "punpcklbw %%mm6, %%mm6\n\t"
  2379. "punpckhbw %%mm7, %%mm7\n\t"
  2380. MOVNTQ" %%mm0, %0\n\t"
  2381. MOVNTQ" %%mm1, 8%0\n\t"
  2382. MOVNTQ" %%mm2, 16%0\n\t"
  2383. MOVNTQ" %%mm3, 24%0\n\t"
  2384. MOVNTQ" %%mm4, 32%0\n\t"
  2385. MOVNTQ" %%mm5, 40%0\n\t"
  2386. MOVNTQ" %%mm6, 48%0\n\t"
  2387. MOVNTQ" %%mm7, 56%0"
  2388. :"=m"(d[2*x])
  2389. :"m"(s1[x])
  2390. :"memory");
  2391. }
  2392. #endif
  2393. for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
  2394. }
  2395. for(y=0;y<h;y++){
  2396. const uint8_t* s2=src2+srcStride2*(y>>1);
  2397. uint8_t* d=dst2+dstStride2*y;
  2398. x=0;
  2399. #ifdef HAVE_MMX
  2400. for(;x<w-31;x+=32)
  2401. {
  2402. asm volatile(
  2403. PREFETCH" 32%1\n\t"
  2404. "movq %1, %%mm0\n\t"
  2405. "movq 8%1, %%mm2\n\t"
  2406. "movq 16%1, %%mm4\n\t"
  2407. "movq 24%1, %%mm6\n\t"
  2408. "movq %%mm0, %%mm1\n\t"
  2409. "movq %%mm2, %%mm3\n\t"
  2410. "movq %%mm4, %%mm5\n\t"
  2411. "movq %%mm6, %%mm7\n\t"
  2412. "punpcklbw %%mm0, %%mm0\n\t"
  2413. "punpckhbw %%mm1, %%mm1\n\t"
  2414. "punpcklbw %%mm2, %%mm2\n\t"
  2415. "punpckhbw %%mm3, %%mm3\n\t"
  2416. "punpcklbw %%mm4, %%mm4\n\t"
  2417. "punpckhbw %%mm5, %%mm5\n\t"
  2418. "punpcklbw %%mm6, %%mm6\n\t"
  2419. "punpckhbw %%mm7, %%mm7\n\t"
  2420. MOVNTQ" %%mm0, %0\n\t"
  2421. MOVNTQ" %%mm1, 8%0\n\t"
  2422. MOVNTQ" %%mm2, 16%0\n\t"
  2423. MOVNTQ" %%mm3, 24%0\n\t"
  2424. MOVNTQ" %%mm4, 32%0\n\t"
  2425. MOVNTQ" %%mm5, 40%0\n\t"
  2426. MOVNTQ" %%mm6, 48%0\n\t"
  2427. MOVNTQ" %%mm7, 56%0"
  2428. :"=m"(d[2*x])
  2429. :"m"(s2[x])
  2430. :"memory");
  2431. }
  2432. #endif
  2433. for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
  2434. }
  2435. #ifdef HAVE_MMX
  2436. asm(
  2437. EMMS" \n\t"
  2438. SFENCE" \n\t"
  2439. ::: "memory"
  2440. );
  2441. #endif
  2442. }
  2443. static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
  2444. uint8_t *dst,
  2445. long width, long height,
  2446. long srcStride1, long srcStride2,
  2447. long srcStride3, long dstStride)
  2448. {
  2449. long y,x,w,h;
  2450. w=width/2; h=height;
  2451. for(y=0;y<h;y++){
  2452. const uint8_t* yp=src1+srcStride1*y;
  2453. const uint8_t* up=src2+srcStride2*(y>>2);
  2454. const uint8_t* vp=src3+srcStride3*(y>>2);
  2455. uint8_t* d=dst+dstStride*y;
  2456. x=0;
  2457. #ifdef HAVE_MMX
  2458. for(;x<w-7;x+=8)
  2459. {
  2460. asm volatile(
  2461. PREFETCH" 32(%1, %0)\n\t"
  2462. PREFETCH" 32(%2, %0)\n\t"
  2463. PREFETCH" 32(%3, %0)\n\t"
  2464. "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
  2465. "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
  2466. "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
  2467. "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
  2468. "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
  2469. "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
  2470. "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
  2471. "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
  2472. "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
  2473. "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
  2474. "movq %%mm1, %%mm6\n\t"
  2475. "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
  2476. "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
  2477. "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
  2478. MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
  2479. MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
  2480. "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
  2481. "movq 8(%1, %0, 4), %%mm0\n\t"
  2482. "movq %%mm0, %%mm3\n\t"
  2483. "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
  2484. "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
  2485. MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
  2486. MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
  2487. "movq %%mm4, %%mm6\n\t"
  2488. "movq 16(%1, %0, 4), %%mm0\n\t"
  2489. "movq %%mm0, %%mm3\n\t"
  2490. "punpcklbw %%mm5, %%mm4\n\t"
  2491. "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
  2492. "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
  2493. MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
  2494. MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
  2495. "punpckhbw %%mm5, %%mm6\n\t"
  2496. "movq 24(%1, %0, 4), %%mm0\n\t"
  2497. "movq %%mm0, %%mm3\n\t"
  2498. "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
  2499. "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
  2500. MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
  2501. MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
  2502. : "+r" (x)
  2503. : "r"(yp), "r" (up), "r"(vp), "r"(d)
  2504. :"memory");
  2505. }
  2506. #endif
  2507. for(; x<w; x++)
  2508. {
  2509. const long x2= x<<2;
  2510. d[8*x+0]=yp[x2];
  2511. d[8*x+1]=up[x];
  2512. d[8*x+2]=yp[x2+1];
  2513. d[8*x+3]=vp[x];
  2514. d[8*x+4]=yp[x2+2];
  2515. d[8*x+5]=up[x];
  2516. d[8*x+6]=yp[x2+3];
  2517. d[8*x+7]=vp[x];
  2518. }
  2519. }
  2520. #ifdef HAVE_MMX
  2521. asm(
  2522. EMMS" \n\t"
  2523. SFENCE" \n\t"
  2524. ::: "memory"
  2525. );
  2526. #endif
  2527. }