You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2689 lines
68KB

  1. /*
  2. *
  3. * rgb2rgb.c, Software RGB to RGB convertor
  4. * pluralize by Software PAL8 to RGB convertor
  5. * Software YUV to YUV convertor
  6. * Software YUV to RGB convertor
  7. * Written by Nick Kurshev.
  8. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  9. * lot of big-endian byteorder fixes by Alex Beregszaszi
  10. *
  11. * This file is part of FFmpeg.
  12. *
  13. * FFmpeg is free software; you can redistribute it and/or modify
  14. * it under the terms of the GNU General Public License as published by
  15. * the Free Software Foundation; either version 2 of the License, or
  16. * (at your option) any later version.
  17. *
  18. * FFmpeg is distributed in the hope that it will be useful,
  19. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  21. * GNU General Public License for more details.
  22. *
  23. * You should have received a copy of the GNU General Public License
  24. * along with FFmpeg; if not, write to the Free Software
  25. * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  26. *
  27. * the C code (not assembly, mmx, ...) of this file can be used
  28. * under the LGPL license too
  29. */
#include <inttypes.h> /* for __WORDSIZE */
#include <stddef.h>
#include <string.h>
  32. #ifndef __WORDSIZE
  33. // #warning You have misconfigured system and probably will lose performance!
  34. #define __WORDSIZE MP_WORDSIZE
  35. #endif
  36. #undef PREFETCH
  37. #undef MOVNTQ
  38. #undef EMMS
  39. #undef SFENCE
  40. #undef MMREG_SIZE
  41. #undef PREFETCHW
  42. #undef PAVGB
  43. #ifdef HAVE_SSE2
  44. #define MMREG_SIZE 16
  45. #else
  46. #define MMREG_SIZE 8
  47. #endif
  48. #ifdef HAVE_3DNOW
  49. #define PREFETCH "prefetch"
  50. #define PREFETCHW "prefetchw"
  51. #define PAVGB "pavgusb"
  52. #elif defined ( HAVE_MMX2 )
  53. #define PREFETCH "prefetchnta"
  54. #define PREFETCHW "prefetcht0"
  55. #define PAVGB "pavgb"
  56. #else
  57. #ifdef __APPLE__
  58. #define PREFETCH "#"
  59. #define PREFETCHW "#"
  60. #else
  61. #define PREFETCH " # nop"
  62. #define PREFETCHW " # nop"
  63. #endif
  64. #endif
  65. #ifdef HAVE_3DNOW
  66. /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
  67. #define EMMS "femms"
  68. #else
  69. #define EMMS "emms"
  70. #endif
  71. #ifdef HAVE_MMX2
  72. #define MOVNTQ "movntq"
  73. #define SFENCE "sfence"
  74. #else
  75. #define MOVNTQ "movq"
  76. #define SFENCE " # nop"
  77. #endif
  78. static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
  79. {
  80. uint8_t *dest = dst;
  81. const uint8_t *s = src;
  82. const uint8_t *end;
  83. #ifdef HAVE_MMX
  84. const uint8_t *mm_end;
  85. #endif
  86. end = s + src_size;
  87. #ifdef HAVE_MMX
  88. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  89. mm_end = end - 23;
  90. __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
  91. while(s < mm_end)
  92. {
  93. __asm __volatile(
  94. PREFETCH" 32%1\n\t"
  95. "movd %1, %%mm0\n\t"
  96. "punpckldq 3%1, %%mm0\n\t"
  97. "movd 6%1, %%mm1\n\t"
  98. "punpckldq 9%1, %%mm1\n\t"
  99. "movd 12%1, %%mm2\n\t"
  100. "punpckldq 15%1, %%mm2\n\t"
  101. "movd 18%1, %%mm3\n\t"
  102. "punpckldq 21%1, %%mm3\n\t"
  103. "pand %%mm7, %%mm0\n\t"
  104. "pand %%mm7, %%mm1\n\t"
  105. "pand %%mm7, %%mm2\n\t"
  106. "pand %%mm7, %%mm3\n\t"
  107. MOVNTQ" %%mm0, %0\n\t"
  108. MOVNTQ" %%mm1, 8%0\n\t"
  109. MOVNTQ" %%mm2, 16%0\n\t"
  110. MOVNTQ" %%mm3, 24%0"
  111. :"=m"(*dest)
  112. :"m"(*s)
  113. :"memory");
  114. dest += 32;
  115. s += 24;
  116. }
  117. __asm __volatile(SFENCE:::"memory");
  118. __asm __volatile(EMMS:::"memory");
  119. #endif
  120. while(s < end)
  121. {
  122. #ifdef WORDS_BIGENDIAN
  123. /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
  124. *dest++ = 0;
  125. *dest++ = s[2];
  126. *dest++ = s[1];
  127. *dest++ = s[0];
  128. s+=3;
  129. #else
  130. *dest++ = *s++;
  131. *dest++ = *s++;
  132. *dest++ = *s++;
  133. *dest++ = 0;
  134. #endif
  135. }
  136. }
  137. static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
  138. {
  139. uint8_t *dest = dst;
  140. const uint8_t *s = src;
  141. const uint8_t *end;
  142. #ifdef HAVE_MMX
  143. const uint8_t *mm_end;
  144. #endif
  145. end = s + src_size;
  146. #ifdef HAVE_MMX
  147. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  148. mm_end = end - 31;
  149. while(s < mm_end)
  150. {
  151. __asm __volatile(
  152. PREFETCH" 32%1\n\t"
  153. "movq %1, %%mm0\n\t"
  154. "movq 8%1, %%mm1\n\t"
  155. "movq 16%1, %%mm4\n\t"
  156. "movq 24%1, %%mm5\n\t"
  157. "movq %%mm0, %%mm2\n\t"
  158. "movq %%mm1, %%mm3\n\t"
  159. "movq %%mm4, %%mm6\n\t"
  160. "movq %%mm5, %%mm7\n\t"
  161. "psrlq $8, %%mm2\n\t"
  162. "psrlq $8, %%mm3\n\t"
  163. "psrlq $8, %%mm6\n\t"
  164. "psrlq $8, %%mm7\n\t"
  165. "pand %2, %%mm0\n\t"
  166. "pand %2, %%mm1\n\t"
  167. "pand %2, %%mm4\n\t"
  168. "pand %2, %%mm5\n\t"
  169. "pand %3, %%mm2\n\t"
  170. "pand %3, %%mm3\n\t"
  171. "pand %3, %%mm6\n\t"
  172. "pand %3, %%mm7\n\t"
  173. "por %%mm2, %%mm0\n\t"
  174. "por %%mm3, %%mm1\n\t"
  175. "por %%mm6, %%mm4\n\t"
  176. "por %%mm7, %%mm5\n\t"
  177. "movq %%mm1, %%mm2\n\t"
  178. "movq %%mm4, %%mm3\n\t"
  179. "psllq $48, %%mm2\n\t"
  180. "psllq $32, %%mm3\n\t"
  181. "pand %4, %%mm2\n\t"
  182. "pand %5, %%mm3\n\t"
  183. "por %%mm2, %%mm0\n\t"
  184. "psrlq $16, %%mm1\n\t"
  185. "psrlq $32, %%mm4\n\t"
  186. "psllq $16, %%mm5\n\t"
  187. "por %%mm3, %%mm1\n\t"
  188. "pand %6, %%mm5\n\t"
  189. "por %%mm5, %%mm4\n\t"
  190. MOVNTQ" %%mm0, %0\n\t"
  191. MOVNTQ" %%mm1, 8%0\n\t"
  192. MOVNTQ" %%mm4, 16%0"
  193. :"=m"(*dest)
  194. :"m"(*s),"m"(mask24l),
  195. "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
  196. :"memory");
  197. dest += 24;
  198. s += 32;
  199. }
  200. __asm __volatile(SFENCE:::"memory");
  201. __asm __volatile(EMMS:::"memory");
  202. #endif
  203. while(s < end)
  204. {
  205. #ifdef WORDS_BIGENDIAN
  206. /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
  207. s++;
  208. dest[2] = *s++;
  209. dest[1] = *s++;
  210. dest[0] = *s++;
  211. dest += 3;
  212. #else
  213. *dest++ = *s++;
  214. *dest++ = *s++;
  215. *dest++ = *s++;
  216. s++;
  217. #endif
  218. }
  219. }
  220. /*
  221. Original by Strepto/Astral
  222. ported to gcc & bugfixed : A'rpi
  223. MMX2, 3DNOW optimization by Nick Kurshev
  224. 32bit c version, and and&add trick by Michael Niedermayer
  225. */
  226. static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
  227. {
  228. register const uint8_t* s=src;
  229. register uint8_t* d=dst;
  230. register const uint8_t *end;
  231. const uint8_t *mm_end;
  232. end = s + src_size;
  233. #ifdef HAVE_MMX
  234. __asm __volatile(PREFETCH" %0"::"m"(*s));
  235. __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
  236. mm_end = end - 15;
  237. while(s<mm_end)
  238. {
  239. __asm __volatile(
  240. PREFETCH" 32%1\n\t"
  241. "movq %1, %%mm0\n\t"
  242. "movq 8%1, %%mm2\n\t"
  243. "movq %%mm0, %%mm1\n\t"
  244. "movq %%mm2, %%mm3\n\t"
  245. "pand %%mm4, %%mm0\n\t"
  246. "pand %%mm4, %%mm2\n\t"
  247. "paddw %%mm1, %%mm0\n\t"
  248. "paddw %%mm3, %%mm2\n\t"
  249. MOVNTQ" %%mm0, %0\n\t"
  250. MOVNTQ" %%mm2, 8%0"
  251. :"=m"(*d)
  252. :"m"(*s)
  253. );
  254. d+=16;
  255. s+=16;
  256. }
  257. __asm __volatile(SFENCE:::"memory");
  258. __asm __volatile(EMMS:::"memory");
  259. #endif
  260. mm_end = end - 3;
  261. while(s < mm_end)
  262. {
  263. register unsigned x= *((uint32_t *)s);
  264. *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
  265. d+=4;
  266. s+=4;
  267. }
  268. if(s < end)
  269. {
  270. register unsigned short x= *((uint16_t *)s);
  271. *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
  272. }
  273. }
  274. static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
  275. {
  276. register const uint8_t* s=src;
  277. register uint8_t* d=dst;
  278. register const uint8_t *end;
  279. const uint8_t *mm_end;
  280. end = s + src_size;
  281. #ifdef HAVE_MMX
  282. __asm __volatile(PREFETCH" %0"::"m"(*s));
  283. __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
  284. __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
  285. mm_end = end - 15;
  286. while(s<mm_end)
  287. {
  288. __asm __volatile(
  289. PREFETCH" 32%1\n\t"
  290. "movq %1, %%mm0\n\t"
  291. "movq 8%1, %%mm2\n\t"
  292. "movq %%mm0, %%mm1\n\t"
  293. "movq %%mm2, %%mm3\n\t"
  294. "psrlq $1, %%mm0\n\t"
  295. "psrlq $1, %%mm2\n\t"
  296. "pand %%mm7, %%mm0\n\t"
  297. "pand %%mm7, %%mm2\n\t"
  298. "pand %%mm6, %%mm1\n\t"
  299. "pand %%mm6, %%mm3\n\t"
  300. "por %%mm1, %%mm0\n\t"
  301. "por %%mm3, %%mm2\n\t"
  302. MOVNTQ" %%mm0, %0\n\t"
  303. MOVNTQ" %%mm2, 8%0"
  304. :"=m"(*d)
  305. :"m"(*s)
  306. );
  307. d+=16;
  308. s+=16;
  309. }
  310. __asm __volatile(SFENCE:::"memory");
  311. __asm __volatile(EMMS:::"memory");
  312. #endif
  313. mm_end = end - 3;
  314. while(s < mm_end)
  315. {
  316. register uint32_t x= *((uint32_t *)s);
  317. *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
  318. s+=4;
  319. d+=4;
  320. }
  321. if(s < end)
  322. {
  323. register uint16_t x= *((uint16_t *)s);
  324. *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
  325. s+=2;
  326. d+=2;
  327. }
  328. }
  329. static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
  330. {
  331. const uint8_t *s = src;
  332. const uint8_t *end;
  333. #ifdef HAVE_MMX
  334. const uint8_t *mm_end;
  335. #endif
  336. uint16_t *d = (uint16_t *)dst;
  337. end = s + src_size;
  338. #ifdef HAVE_MMX
  339. mm_end = end - 15;
  340. #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
  341. asm volatile(
  342. "movq %3, %%mm5 \n\t"
  343. "movq %4, %%mm6 \n\t"
  344. "movq %5, %%mm7 \n\t"
  345. ASMALIGN(4)
  346. "1: \n\t"
  347. PREFETCH" 32(%1) \n\t"
  348. "movd (%1), %%mm0 \n\t"
  349. "movd 4(%1), %%mm3 \n\t"
  350. "punpckldq 8(%1), %%mm0 \n\t"
  351. "punpckldq 12(%1), %%mm3 \n\t"
  352. "movq %%mm0, %%mm1 \n\t"
  353. "movq %%mm3, %%mm4 \n\t"
  354. "pand %%mm6, %%mm0 \n\t"
  355. "pand %%mm6, %%mm3 \n\t"
  356. "pmaddwd %%mm7, %%mm0 \n\t"
  357. "pmaddwd %%mm7, %%mm3 \n\t"
  358. "pand %%mm5, %%mm1 \n\t"
  359. "pand %%mm5, %%mm4 \n\t"
  360. "por %%mm1, %%mm0 \n\t"
  361. "por %%mm4, %%mm3 \n\t"
  362. "psrld $5, %%mm0 \n\t"
  363. "pslld $11, %%mm3 \n\t"
  364. "por %%mm3, %%mm0 \n\t"
  365. MOVNTQ" %%mm0, (%0) \n\t"
  366. "add $16, %1 \n\t"
  367. "add $8, %0 \n\t"
  368. "cmp %2, %1 \n\t"
  369. " jb 1b \n\t"
  370. : "+r" (d), "+r"(s)
  371. : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
  372. );
  373. #else
  374. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  375. __asm __volatile(
  376. "movq %0, %%mm7\n\t"
  377. "movq %1, %%mm6\n\t"
  378. ::"m"(red_16mask),"m"(green_16mask));
  379. while(s < mm_end)
  380. {
  381. __asm __volatile(
  382. PREFETCH" 32%1\n\t"
  383. "movd %1, %%mm0\n\t"
  384. "movd 4%1, %%mm3\n\t"
  385. "punpckldq 8%1, %%mm0\n\t"
  386. "punpckldq 12%1, %%mm3\n\t"
  387. "movq %%mm0, %%mm1\n\t"
  388. "movq %%mm0, %%mm2\n\t"
  389. "movq %%mm3, %%mm4\n\t"
  390. "movq %%mm3, %%mm5\n\t"
  391. "psrlq $3, %%mm0\n\t"
  392. "psrlq $3, %%mm3\n\t"
  393. "pand %2, %%mm0\n\t"
  394. "pand %2, %%mm3\n\t"
  395. "psrlq $5, %%mm1\n\t"
  396. "psrlq $5, %%mm4\n\t"
  397. "pand %%mm6, %%mm1\n\t"
  398. "pand %%mm6, %%mm4\n\t"
  399. "psrlq $8, %%mm2\n\t"
  400. "psrlq $8, %%mm5\n\t"
  401. "pand %%mm7, %%mm2\n\t"
  402. "pand %%mm7, %%mm5\n\t"
  403. "por %%mm1, %%mm0\n\t"
  404. "por %%mm4, %%mm3\n\t"
  405. "por %%mm2, %%mm0\n\t"
  406. "por %%mm5, %%mm3\n\t"
  407. "psllq $16, %%mm3\n\t"
  408. "por %%mm3, %%mm0\n\t"
  409. MOVNTQ" %%mm0, %0\n\t"
  410. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  411. d += 4;
  412. s += 16;
  413. }
  414. #endif
  415. __asm __volatile(SFENCE:::"memory");
  416. __asm __volatile(EMMS:::"memory");
  417. #endif
  418. while(s < end)
  419. {
  420. register int rgb = *(uint32_t*)s; s += 4;
  421. *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
  422. }
  423. }
  424. static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
  425. {
  426. const uint8_t *s = src;
  427. const uint8_t *end;
  428. #ifdef HAVE_MMX
  429. const uint8_t *mm_end;
  430. #endif
  431. uint16_t *d = (uint16_t *)dst;
  432. end = s + src_size;
  433. #ifdef HAVE_MMX
  434. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  435. __asm __volatile(
  436. "movq %0, %%mm7\n\t"
  437. "movq %1, %%mm6\n\t"
  438. ::"m"(red_16mask),"m"(green_16mask));
  439. mm_end = end - 15;
  440. while(s < mm_end)
  441. {
  442. __asm __volatile(
  443. PREFETCH" 32%1\n\t"
  444. "movd %1, %%mm0\n\t"
  445. "movd 4%1, %%mm3\n\t"
  446. "punpckldq 8%1, %%mm0\n\t"
  447. "punpckldq 12%1, %%mm3\n\t"
  448. "movq %%mm0, %%mm1\n\t"
  449. "movq %%mm0, %%mm2\n\t"
  450. "movq %%mm3, %%mm4\n\t"
  451. "movq %%mm3, %%mm5\n\t"
  452. "psllq $8, %%mm0\n\t"
  453. "psllq $8, %%mm3\n\t"
  454. "pand %%mm7, %%mm0\n\t"
  455. "pand %%mm7, %%mm3\n\t"
  456. "psrlq $5, %%mm1\n\t"
  457. "psrlq $5, %%mm4\n\t"
  458. "pand %%mm6, %%mm1\n\t"
  459. "pand %%mm6, %%mm4\n\t"
  460. "psrlq $19, %%mm2\n\t"
  461. "psrlq $19, %%mm5\n\t"
  462. "pand %2, %%mm2\n\t"
  463. "pand %2, %%mm5\n\t"
  464. "por %%mm1, %%mm0\n\t"
  465. "por %%mm4, %%mm3\n\t"
  466. "por %%mm2, %%mm0\n\t"
  467. "por %%mm5, %%mm3\n\t"
  468. "psllq $16, %%mm3\n\t"
  469. "por %%mm3, %%mm0\n\t"
  470. MOVNTQ" %%mm0, %0\n\t"
  471. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  472. d += 4;
  473. s += 16;
  474. }
  475. __asm __volatile(SFENCE:::"memory");
  476. __asm __volatile(EMMS:::"memory");
  477. #endif
  478. while(s < end)
  479. {
  480. register int rgb = *(uint32_t*)s; s += 4;
  481. *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
  482. }
  483. }
  484. static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
  485. {
  486. const uint8_t *s = src;
  487. const uint8_t *end;
  488. #ifdef HAVE_MMX
  489. const uint8_t *mm_end;
  490. #endif
  491. uint16_t *d = (uint16_t *)dst;
  492. end = s + src_size;
  493. #ifdef HAVE_MMX
  494. mm_end = end - 15;
  495. #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
  496. asm volatile(
  497. "movq %3, %%mm5 \n\t"
  498. "movq %4, %%mm6 \n\t"
  499. "movq %5, %%mm7 \n\t"
  500. ASMALIGN(4)
  501. "1: \n\t"
  502. PREFETCH" 32(%1) \n\t"
  503. "movd (%1), %%mm0 \n\t"
  504. "movd 4(%1), %%mm3 \n\t"
  505. "punpckldq 8(%1), %%mm0 \n\t"
  506. "punpckldq 12(%1), %%mm3 \n\t"
  507. "movq %%mm0, %%mm1 \n\t"
  508. "movq %%mm3, %%mm4 \n\t"
  509. "pand %%mm6, %%mm0 \n\t"
  510. "pand %%mm6, %%mm3 \n\t"
  511. "pmaddwd %%mm7, %%mm0 \n\t"
  512. "pmaddwd %%mm7, %%mm3 \n\t"
  513. "pand %%mm5, %%mm1 \n\t"
  514. "pand %%mm5, %%mm4 \n\t"
  515. "por %%mm1, %%mm0 \n\t"
  516. "por %%mm4, %%mm3 \n\t"
  517. "psrld $6, %%mm0 \n\t"
  518. "pslld $10, %%mm3 \n\t"
  519. "por %%mm3, %%mm0 \n\t"
  520. MOVNTQ" %%mm0, (%0) \n\t"
  521. "add $16, %1 \n\t"
  522. "add $8, %0 \n\t"
  523. "cmp %2, %1 \n\t"
  524. " jb 1b \n\t"
  525. : "+r" (d), "+r"(s)
  526. : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
  527. );
  528. #else
  529. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  530. __asm __volatile(
  531. "movq %0, %%mm7\n\t"
  532. "movq %1, %%mm6\n\t"
  533. ::"m"(red_15mask),"m"(green_15mask));
  534. while(s < mm_end)
  535. {
  536. __asm __volatile(
  537. PREFETCH" 32%1\n\t"
  538. "movd %1, %%mm0\n\t"
  539. "movd 4%1, %%mm3\n\t"
  540. "punpckldq 8%1, %%mm0\n\t"
  541. "punpckldq 12%1, %%mm3\n\t"
  542. "movq %%mm0, %%mm1\n\t"
  543. "movq %%mm0, %%mm2\n\t"
  544. "movq %%mm3, %%mm4\n\t"
  545. "movq %%mm3, %%mm5\n\t"
  546. "psrlq $3, %%mm0\n\t"
  547. "psrlq $3, %%mm3\n\t"
  548. "pand %2, %%mm0\n\t"
  549. "pand %2, %%mm3\n\t"
  550. "psrlq $6, %%mm1\n\t"
  551. "psrlq $6, %%mm4\n\t"
  552. "pand %%mm6, %%mm1\n\t"
  553. "pand %%mm6, %%mm4\n\t"
  554. "psrlq $9, %%mm2\n\t"
  555. "psrlq $9, %%mm5\n\t"
  556. "pand %%mm7, %%mm2\n\t"
  557. "pand %%mm7, %%mm5\n\t"
  558. "por %%mm1, %%mm0\n\t"
  559. "por %%mm4, %%mm3\n\t"
  560. "por %%mm2, %%mm0\n\t"
  561. "por %%mm5, %%mm3\n\t"
  562. "psllq $16, %%mm3\n\t"
  563. "por %%mm3, %%mm0\n\t"
  564. MOVNTQ" %%mm0, %0\n\t"
  565. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  566. d += 4;
  567. s += 16;
  568. }
  569. #endif
  570. __asm __volatile(SFENCE:::"memory");
  571. __asm __volatile(EMMS:::"memory");
  572. #endif
  573. while(s < end)
  574. {
  575. register int rgb = *(uint32_t*)s; s += 4;
  576. *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
  577. }
  578. }
  579. static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
  580. {
  581. const uint8_t *s = src;
  582. const uint8_t *end;
  583. #ifdef HAVE_MMX
  584. const uint8_t *mm_end;
  585. #endif
  586. uint16_t *d = (uint16_t *)dst;
  587. end = s + src_size;
  588. #ifdef HAVE_MMX
  589. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  590. __asm __volatile(
  591. "movq %0, %%mm7\n\t"
  592. "movq %1, %%mm6\n\t"
  593. ::"m"(red_15mask),"m"(green_15mask));
  594. mm_end = end - 15;
  595. while(s < mm_end)
  596. {
  597. __asm __volatile(
  598. PREFETCH" 32%1\n\t"
  599. "movd %1, %%mm0\n\t"
  600. "movd 4%1, %%mm3\n\t"
  601. "punpckldq 8%1, %%mm0\n\t"
  602. "punpckldq 12%1, %%mm3\n\t"
  603. "movq %%mm0, %%mm1\n\t"
  604. "movq %%mm0, %%mm2\n\t"
  605. "movq %%mm3, %%mm4\n\t"
  606. "movq %%mm3, %%mm5\n\t"
  607. "psllq $7, %%mm0\n\t"
  608. "psllq $7, %%mm3\n\t"
  609. "pand %%mm7, %%mm0\n\t"
  610. "pand %%mm7, %%mm3\n\t"
  611. "psrlq $6, %%mm1\n\t"
  612. "psrlq $6, %%mm4\n\t"
  613. "pand %%mm6, %%mm1\n\t"
  614. "pand %%mm6, %%mm4\n\t"
  615. "psrlq $19, %%mm2\n\t"
  616. "psrlq $19, %%mm5\n\t"
  617. "pand %2, %%mm2\n\t"
  618. "pand %2, %%mm5\n\t"
  619. "por %%mm1, %%mm0\n\t"
  620. "por %%mm4, %%mm3\n\t"
  621. "por %%mm2, %%mm0\n\t"
  622. "por %%mm5, %%mm3\n\t"
  623. "psllq $16, %%mm3\n\t"
  624. "por %%mm3, %%mm0\n\t"
  625. MOVNTQ" %%mm0, %0\n\t"
  626. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  627. d += 4;
  628. s += 16;
  629. }
  630. __asm __volatile(SFENCE:::"memory");
  631. __asm __volatile(EMMS:::"memory");
  632. #endif
  633. while(s < end)
  634. {
  635. register int rgb = *(uint32_t*)s; s += 4;
  636. *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
  637. }
  638. }
  639. static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
  640. {
  641. const uint8_t *s = src;
  642. const uint8_t *end;
  643. #ifdef HAVE_MMX
  644. const uint8_t *mm_end;
  645. #endif
  646. uint16_t *d = (uint16_t *)dst;
  647. end = s + src_size;
  648. #ifdef HAVE_MMX
  649. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  650. __asm __volatile(
  651. "movq %0, %%mm7\n\t"
  652. "movq %1, %%mm6\n\t"
  653. ::"m"(red_16mask),"m"(green_16mask));
  654. mm_end = end - 11;
  655. while(s < mm_end)
  656. {
  657. __asm __volatile(
  658. PREFETCH" 32%1\n\t"
  659. "movd %1, %%mm0\n\t"
  660. "movd 3%1, %%mm3\n\t"
  661. "punpckldq 6%1, %%mm0\n\t"
  662. "punpckldq 9%1, %%mm3\n\t"
  663. "movq %%mm0, %%mm1\n\t"
  664. "movq %%mm0, %%mm2\n\t"
  665. "movq %%mm3, %%mm4\n\t"
  666. "movq %%mm3, %%mm5\n\t"
  667. "psrlq $3, %%mm0\n\t"
  668. "psrlq $3, %%mm3\n\t"
  669. "pand %2, %%mm0\n\t"
  670. "pand %2, %%mm3\n\t"
  671. "psrlq $5, %%mm1\n\t"
  672. "psrlq $5, %%mm4\n\t"
  673. "pand %%mm6, %%mm1\n\t"
  674. "pand %%mm6, %%mm4\n\t"
  675. "psrlq $8, %%mm2\n\t"
  676. "psrlq $8, %%mm5\n\t"
  677. "pand %%mm7, %%mm2\n\t"
  678. "pand %%mm7, %%mm5\n\t"
  679. "por %%mm1, %%mm0\n\t"
  680. "por %%mm4, %%mm3\n\t"
  681. "por %%mm2, %%mm0\n\t"
  682. "por %%mm5, %%mm3\n\t"
  683. "psllq $16, %%mm3\n\t"
  684. "por %%mm3, %%mm0\n\t"
  685. MOVNTQ" %%mm0, %0\n\t"
  686. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  687. d += 4;
  688. s += 12;
  689. }
  690. __asm __volatile(SFENCE:::"memory");
  691. __asm __volatile(EMMS:::"memory");
  692. #endif
  693. while(s < end)
  694. {
  695. const int b= *s++;
  696. const int g= *s++;
  697. const int r= *s++;
  698. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  699. }
  700. }
  701. static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
  702. {
  703. const uint8_t *s = src;
  704. const uint8_t *end;
  705. #ifdef HAVE_MMX
  706. const uint8_t *mm_end;
  707. #endif
  708. uint16_t *d = (uint16_t *)dst;
  709. end = s + src_size;
  710. #ifdef HAVE_MMX
  711. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  712. __asm __volatile(
  713. "movq %0, %%mm7\n\t"
  714. "movq %1, %%mm6\n\t"
  715. ::"m"(red_16mask),"m"(green_16mask));
  716. mm_end = end - 15;
  717. while(s < mm_end)
  718. {
  719. __asm __volatile(
  720. PREFETCH" 32%1\n\t"
  721. "movd %1, %%mm0\n\t"
  722. "movd 3%1, %%mm3\n\t"
  723. "punpckldq 6%1, %%mm0\n\t"
  724. "punpckldq 9%1, %%mm3\n\t"
  725. "movq %%mm0, %%mm1\n\t"
  726. "movq %%mm0, %%mm2\n\t"
  727. "movq %%mm3, %%mm4\n\t"
  728. "movq %%mm3, %%mm5\n\t"
  729. "psllq $8, %%mm0\n\t"
  730. "psllq $8, %%mm3\n\t"
  731. "pand %%mm7, %%mm0\n\t"
  732. "pand %%mm7, %%mm3\n\t"
  733. "psrlq $5, %%mm1\n\t"
  734. "psrlq $5, %%mm4\n\t"
  735. "pand %%mm6, %%mm1\n\t"
  736. "pand %%mm6, %%mm4\n\t"
  737. "psrlq $19, %%mm2\n\t"
  738. "psrlq $19, %%mm5\n\t"
  739. "pand %2, %%mm2\n\t"
  740. "pand %2, %%mm5\n\t"
  741. "por %%mm1, %%mm0\n\t"
  742. "por %%mm4, %%mm3\n\t"
  743. "por %%mm2, %%mm0\n\t"
  744. "por %%mm5, %%mm3\n\t"
  745. "psllq $16, %%mm3\n\t"
  746. "por %%mm3, %%mm0\n\t"
  747. MOVNTQ" %%mm0, %0\n\t"
  748. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  749. d += 4;
  750. s += 12;
  751. }
  752. __asm __volatile(SFENCE:::"memory");
  753. __asm __volatile(EMMS:::"memory");
  754. #endif
  755. while(s < end)
  756. {
  757. const int r= *s++;
  758. const int g= *s++;
  759. const int b= *s++;
  760. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  761. }
  762. }
  763. static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
  764. {
  765. const uint8_t *s = src;
  766. const uint8_t *end;
  767. #ifdef HAVE_MMX
  768. const uint8_t *mm_end;
  769. #endif
  770. uint16_t *d = (uint16_t *)dst;
  771. end = s + src_size;
  772. #ifdef HAVE_MMX
  773. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  774. __asm __volatile(
  775. "movq %0, %%mm7\n\t"
  776. "movq %1, %%mm6\n\t"
  777. ::"m"(red_15mask),"m"(green_15mask));
  778. mm_end = end - 11;
  779. while(s < mm_end)
  780. {
  781. __asm __volatile(
  782. PREFETCH" 32%1\n\t"
  783. "movd %1, %%mm0\n\t"
  784. "movd 3%1, %%mm3\n\t"
  785. "punpckldq 6%1, %%mm0\n\t"
  786. "punpckldq 9%1, %%mm3\n\t"
  787. "movq %%mm0, %%mm1\n\t"
  788. "movq %%mm0, %%mm2\n\t"
  789. "movq %%mm3, %%mm4\n\t"
  790. "movq %%mm3, %%mm5\n\t"
  791. "psrlq $3, %%mm0\n\t"
  792. "psrlq $3, %%mm3\n\t"
  793. "pand %2, %%mm0\n\t"
  794. "pand %2, %%mm3\n\t"
  795. "psrlq $6, %%mm1\n\t"
  796. "psrlq $6, %%mm4\n\t"
  797. "pand %%mm6, %%mm1\n\t"
  798. "pand %%mm6, %%mm4\n\t"
  799. "psrlq $9, %%mm2\n\t"
  800. "psrlq $9, %%mm5\n\t"
  801. "pand %%mm7, %%mm2\n\t"
  802. "pand %%mm7, %%mm5\n\t"
  803. "por %%mm1, %%mm0\n\t"
  804. "por %%mm4, %%mm3\n\t"
  805. "por %%mm2, %%mm0\n\t"
  806. "por %%mm5, %%mm3\n\t"
  807. "psllq $16, %%mm3\n\t"
  808. "por %%mm3, %%mm0\n\t"
  809. MOVNTQ" %%mm0, %0\n\t"
  810. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  811. d += 4;
  812. s += 12;
  813. }
  814. __asm __volatile(SFENCE:::"memory");
  815. __asm __volatile(EMMS:::"memory");
  816. #endif
  817. while(s < end)
  818. {
  819. const int b= *s++;
  820. const int g= *s++;
  821. const int r= *s++;
  822. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  823. }
  824. }
  825. static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
  826. {
  827. const uint8_t *s = src;
  828. const uint8_t *end;
  829. #ifdef HAVE_MMX
  830. const uint8_t *mm_end;
  831. #endif
  832. uint16_t *d = (uint16_t *)dst;
  833. end = s + src_size;
  834. #ifdef HAVE_MMX
  835. __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
  836. __asm __volatile(
  837. "movq %0, %%mm7\n\t"
  838. "movq %1, %%mm6\n\t"
  839. ::"m"(red_15mask),"m"(green_15mask));
  840. mm_end = end - 15;
  841. while(s < mm_end)
  842. {
  843. __asm __volatile(
  844. PREFETCH" 32%1\n\t"
  845. "movd %1, %%mm0\n\t"
  846. "movd 3%1, %%mm3\n\t"
  847. "punpckldq 6%1, %%mm0\n\t"
  848. "punpckldq 9%1, %%mm3\n\t"
  849. "movq %%mm0, %%mm1\n\t"
  850. "movq %%mm0, %%mm2\n\t"
  851. "movq %%mm3, %%mm4\n\t"
  852. "movq %%mm3, %%mm5\n\t"
  853. "psllq $7, %%mm0\n\t"
  854. "psllq $7, %%mm3\n\t"
  855. "pand %%mm7, %%mm0\n\t"
  856. "pand %%mm7, %%mm3\n\t"
  857. "psrlq $6, %%mm1\n\t"
  858. "psrlq $6, %%mm4\n\t"
  859. "pand %%mm6, %%mm1\n\t"
  860. "pand %%mm6, %%mm4\n\t"
  861. "psrlq $19, %%mm2\n\t"
  862. "psrlq $19, %%mm5\n\t"
  863. "pand %2, %%mm2\n\t"
  864. "pand %2, %%mm5\n\t"
  865. "por %%mm1, %%mm0\n\t"
  866. "por %%mm4, %%mm3\n\t"
  867. "por %%mm2, %%mm0\n\t"
  868. "por %%mm5, %%mm3\n\t"
  869. "psllq $16, %%mm3\n\t"
  870. "por %%mm3, %%mm0\n\t"
  871. MOVNTQ" %%mm0, %0\n\t"
  872. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  873. d += 4;
  874. s += 12;
  875. }
  876. __asm __volatile(SFENCE:::"memory");
  877. __asm __volatile(EMMS:::"memory");
  878. #endif
  879. while(s < end)
  880. {
  881. const int r= *s++;
  882. const int g= *s++;
  883. const int b= *s++;
  884. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  885. }
  886. }
/*
  I use a less accurate approximation here by simply left-shifting the input
  value and filling the low order bits with zeroes. This method improves PNG
  compression, but this scheme cannot reproduce white exactly, since it does
  not generate an all-ones maximum value; the net effect is to darken the
  image slightly.

  The better method should be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      Leftmost Bits Repeated to Fill Open Bits
       |
   Original Bits
*/
  907. static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
  908. {
  909. const uint16_t *end;
  910. #ifdef HAVE_MMX
  911. const uint16_t *mm_end;
  912. #endif
  913. uint8_t *d = (uint8_t *)dst;
  914. const uint16_t *s = (uint16_t *)src;
  915. end = s + src_size/2;
  916. #ifdef HAVE_MMX
  917. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  918. mm_end = end - 7;
  919. while(s < mm_end)
  920. {
  921. __asm __volatile(
  922. PREFETCH" 32%1\n\t"
  923. "movq %1, %%mm0\n\t"
  924. "movq %1, %%mm1\n\t"
  925. "movq %1, %%mm2\n\t"
  926. "pand %2, %%mm0\n\t"
  927. "pand %3, %%mm1\n\t"
  928. "pand %4, %%mm2\n\t"
  929. "psllq $3, %%mm0\n\t"
  930. "psrlq $2, %%mm1\n\t"
  931. "psrlq $7, %%mm2\n\t"
  932. "movq %%mm0, %%mm3\n\t"
  933. "movq %%mm1, %%mm4\n\t"
  934. "movq %%mm2, %%mm5\n\t"
  935. "punpcklwd %5, %%mm0\n\t"
  936. "punpcklwd %5, %%mm1\n\t"
  937. "punpcklwd %5, %%mm2\n\t"
  938. "punpckhwd %5, %%mm3\n\t"
  939. "punpckhwd %5, %%mm4\n\t"
  940. "punpckhwd %5, %%mm5\n\t"
  941. "psllq $8, %%mm1\n\t"
  942. "psllq $16, %%mm2\n\t"
  943. "por %%mm1, %%mm0\n\t"
  944. "por %%mm2, %%mm0\n\t"
  945. "psllq $8, %%mm4\n\t"
  946. "psllq $16, %%mm5\n\t"
  947. "por %%mm4, %%mm3\n\t"
  948. "por %%mm5, %%mm3\n\t"
  949. "movq %%mm0, %%mm6\n\t"
  950. "movq %%mm3, %%mm7\n\t"
  951. "movq 8%1, %%mm0\n\t"
  952. "movq 8%1, %%mm1\n\t"
  953. "movq 8%1, %%mm2\n\t"
  954. "pand %2, %%mm0\n\t"
  955. "pand %3, %%mm1\n\t"
  956. "pand %4, %%mm2\n\t"
  957. "psllq $3, %%mm0\n\t"
  958. "psrlq $2, %%mm1\n\t"
  959. "psrlq $7, %%mm2\n\t"
  960. "movq %%mm0, %%mm3\n\t"
  961. "movq %%mm1, %%mm4\n\t"
  962. "movq %%mm2, %%mm5\n\t"
  963. "punpcklwd %5, %%mm0\n\t"
  964. "punpcklwd %5, %%mm1\n\t"
  965. "punpcklwd %5, %%mm2\n\t"
  966. "punpckhwd %5, %%mm3\n\t"
  967. "punpckhwd %5, %%mm4\n\t"
  968. "punpckhwd %5, %%mm5\n\t"
  969. "psllq $8, %%mm1\n\t"
  970. "psllq $16, %%mm2\n\t"
  971. "por %%mm1, %%mm0\n\t"
  972. "por %%mm2, %%mm0\n\t"
  973. "psllq $8, %%mm4\n\t"
  974. "psllq $16, %%mm5\n\t"
  975. "por %%mm4, %%mm3\n\t"
  976. "por %%mm5, %%mm3\n\t"
  977. :"=m"(*d)
  978. :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
  979. :"memory");
  980. /* Borrowed 32 to 24 */
  981. __asm __volatile(
  982. "movq %%mm0, %%mm4\n\t"
  983. "movq %%mm3, %%mm5\n\t"
  984. "movq %%mm6, %%mm0\n\t"
  985. "movq %%mm7, %%mm1\n\t"
  986. "movq %%mm4, %%mm6\n\t"
  987. "movq %%mm5, %%mm7\n\t"
  988. "movq %%mm0, %%mm2\n\t"
  989. "movq %%mm1, %%mm3\n\t"
  990. "psrlq $8, %%mm2\n\t"
  991. "psrlq $8, %%mm3\n\t"
  992. "psrlq $8, %%mm6\n\t"
  993. "psrlq $8, %%mm7\n\t"
  994. "pand %2, %%mm0\n\t"
  995. "pand %2, %%mm1\n\t"
  996. "pand %2, %%mm4\n\t"
  997. "pand %2, %%mm5\n\t"
  998. "pand %3, %%mm2\n\t"
  999. "pand %3, %%mm3\n\t"
  1000. "pand %3, %%mm6\n\t"
  1001. "pand %3, %%mm7\n\t"
  1002. "por %%mm2, %%mm0\n\t"
  1003. "por %%mm3, %%mm1\n\t"
  1004. "por %%mm6, %%mm4\n\t"
  1005. "por %%mm7, %%mm5\n\t"
  1006. "movq %%mm1, %%mm2\n\t"
  1007. "movq %%mm4, %%mm3\n\t"
  1008. "psllq $48, %%mm2\n\t"
  1009. "psllq $32, %%mm3\n\t"
  1010. "pand %4, %%mm2\n\t"
  1011. "pand %5, %%mm3\n\t"
  1012. "por %%mm2, %%mm0\n\t"
  1013. "psrlq $16, %%mm1\n\t"
  1014. "psrlq $32, %%mm4\n\t"
  1015. "psllq $16, %%mm5\n\t"
  1016. "por %%mm3, %%mm1\n\t"
  1017. "pand %6, %%mm5\n\t"
  1018. "por %%mm5, %%mm4\n\t"
  1019. MOVNTQ" %%mm0, %0\n\t"
  1020. MOVNTQ" %%mm1, 8%0\n\t"
  1021. MOVNTQ" %%mm4, 16%0"
  1022. :"=m"(*d)
  1023. :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
  1024. :"memory");
  1025. d += 24;
  1026. s += 8;
  1027. }
  1028. __asm __volatile(SFENCE:::"memory");
  1029. __asm __volatile(EMMS:::"memory");
  1030. #endif
  1031. while(s < end)
  1032. {
  1033. register uint16_t bgr;
  1034. bgr = *s++;
  1035. *d++ = (bgr&0x1F)<<3;
  1036. *d++ = (bgr&0x3E0)>>2;
  1037. *d++ = (bgr&0x7C00)>>7;
  1038. }
  1039. }
  1040. static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
  1041. {
  1042. const uint16_t *end;
  1043. #ifdef HAVE_MMX
  1044. const uint16_t *mm_end;
  1045. #endif
  1046. uint8_t *d = (uint8_t *)dst;
  1047. const uint16_t *s = (const uint16_t *)src;
  1048. end = s + src_size/2;
  1049. #ifdef HAVE_MMX
  1050. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  1051. mm_end = end - 7;
  1052. while(s < mm_end)
  1053. {
  1054. __asm __volatile(
  1055. PREFETCH" 32%1\n\t"
  1056. "movq %1, %%mm0\n\t"
  1057. "movq %1, %%mm1\n\t"
  1058. "movq %1, %%mm2\n\t"
  1059. "pand %2, %%mm0\n\t"
  1060. "pand %3, %%mm1\n\t"
  1061. "pand %4, %%mm2\n\t"
  1062. "psllq $3, %%mm0\n\t"
  1063. "psrlq $3, %%mm1\n\t"
  1064. "psrlq $8, %%mm2\n\t"
  1065. "movq %%mm0, %%mm3\n\t"
  1066. "movq %%mm1, %%mm4\n\t"
  1067. "movq %%mm2, %%mm5\n\t"
  1068. "punpcklwd %5, %%mm0\n\t"
  1069. "punpcklwd %5, %%mm1\n\t"
  1070. "punpcklwd %5, %%mm2\n\t"
  1071. "punpckhwd %5, %%mm3\n\t"
  1072. "punpckhwd %5, %%mm4\n\t"
  1073. "punpckhwd %5, %%mm5\n\t"
  1074. "psllq $8, %%mm1\n\t"
  1075. "psllq $16, %%mm2\n\t"
  1076. "por %%mm1, %%mm0\n\t"
  1077. "por %%mm2, %%mm0\n\t"
  1078. "psllq $8, %%mm4\n\t"
  1079. "psllq $16, %%mm5\n\t"
  1080. "por %%mm4, %%mm3\n\t"
  1081. "por %%mm5, %%mm3\n\t"
  1082. "movq %%mm0, %%mm6\n\t"
  1083. "movq %%mm3, %%mm7\n\t"
  1084. "movq 8%1, %%mm0\n\t"
  1085. "movq 8%1, %%mm1\n\t"
  1086. "movq 8%1, %%mm2\n\t"
  1087. "pand %2, %%mm0\n\t"
  1088. "pand %3, %%mm1\n\t"
  1089. "pand %4, %%mm2\n\t"
  1090. "psllq $3, %%mm0\n\t"
  1091. "psrlq $3, %%mm1\n\t"
  1092. "psrlq $8, %%mm2\n\t"
  1093. "movq %%mm0, %%mm3\n\t"
  1094. "movq %%mm1, %%mm4\n\t"
  1095. "movq %%mm2, %%mm5\n\t"
  1096. "punpcklwd %5, %%mm0\n\t"
  1097. "punpcklwd %5, %%mm1\n\t"
  1098. "punpcklwd %5, %%mm2\n\t"
  1099. "punpckhwd %5, %%mm3\n\t"
  1100. "punpckhwd %5, %%mm4\n\t"
  1101. "punpckhwd %5, %%mm5\n\t"
  1102. "psllq $8, %%mm1\n\t"
  1103. "psllq $16, %%mm2\n\t"
  1104. "por %%mm1, %%mm0\n\t"
  1105. "por %%mm2, %%mm0\n\t"
  1106. "psllq $8, %%mm4\n\t"
  1107. "psllq $16, %%mm5\n\t"
  1108. "por %%mm4, %%mm3\n\t"
  1109. "por %%mm5, %%mm3\n\t"
  1110. :"=m"(*d)
  1111. :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
  1112. :"memory");
  1113. /* Borrowed 32 to 24 */
  1114. __asm __volatile(
  1115. "movq %%mm0, %%mm4\n\t"
  1116. "movq %%mm3, %%mm5\n\t"
  1117. "movq %%mm6, %%mm0\n\t"
  1118. "movq %%mm7, %%mm1\n\t"
  1119. "movq %%mm4, %%mm6\n\t"
  1120. "movq %%mm5, %%mm7\n\t"
  1121. "movq %%mm0, %%mm2\n\t"
  1122. "movq %%mm1, %%mm3\n\t"
  1123. "psrlq $8, %%mm2\n\t"
  1124. "psrlq $8, %%mm3\n\t"
  1125. "psrlq $8, %%mm6\n\t"
  1126. "psrlq $8, %%mm7\n\t"
  1127. "pand %2, %%mm0\n\t"
  1128. "pand %2, %%mm1\n\t"
  1129. "pand %2, %%mm4\n\t"
  1130. "pand %2, %%mm5\n\t"
  1131. "pand %3, %%mm2\n\t"
  1132. "pand %3, %%mm3\n\t"
  1133. "pand %3, %%mm6\n\t"
  1134. "pand %3, %%mm7\n\t"
  1135. "por %%mm2, %%mm0\n\t"
  1136. "por %%mm3, %%mm1\n\t"
  1137. "por %%mm6, %%mm4\n\t"
  1138. "por %%mm7, %%mm5\n\t"
  1139. "movq %%mm1, %%mm2\n\t"
  1140. "movq %%mm4, %%mm3\n\t"
  1141. "psllq $48, %%mm2\n\t"
  1142. "psllq $32, %%mm3\n\t"
  1143. "pand %4, %%mm2\n\t"
  1144. "pand %5, %%mm3\n\t"
  1145. "por %%mm2, %%mm0\n\t"
  1146. "psrlq $16, %%mm1\n\t"
  1147. "psrlq $32, %%mm4\n\t"
  1148. "psllq $16, %%mm5\n\t"
  1149. "por %%mm3, %%mm1\n\t"
  1150. "pand %6, %%mm5\n\t"
  1151. "por %%mm5, %%mm4\n\t"
  1152. MOVNTQ" %%mm0, %0\n\t"
  1153. MOVNTQ" %%mm1, 8%0\n\t"
  1154. MOVNTQ" %%mm4, 16%0"
  1155. :"=m"(*d)
  1156. :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
  1157. :"memory");
  1158. d += 24;
  1159. s += 8;
  1160. }
  1161. __asm __volatile(SFENCE:::"memory");
  1162. __asm __volatile(EMMS:::"memory");
  1163. #endif
  1164. while(s < end)
  1165. {
  1166. register uint16_t bgr;
  1167. bgr = *s++;
  1168. *d++ = (bgr&0x1F)<<3;
  1169. *d++ = (bgr&0x7E0)>>3;
  1170. *d++ = (bgr&0xF800)>>8;
  1171. }
  1172. }
  1173. static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
  1174. {
  1175. const uint16_t *end;
  1176. #ifdef HAVE_MMX
  1177. const uint16_t *mm_end;
  1178. #endif
  1179. uint8_t *d = (uint8_t *)dst;
  1180. const uint16_t *s = (const uint16_t *)src;
  1181. end = s + src_size/2;
  1182. #ifdef HAVE_MMX
  1183. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  1184. __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
  1185. mm_end = end - 3;
  1186. while(s < mm_end)
  1187. {
  1188. __asm __volatile(
  1189. PREFETCH" 32%1\n\t"
  1190. "movq %1, %%mm0\n\t"
  1191. "movq %1, %%mm1\n\t"
  1192. "movq %1, %%mm2\n\t"
  1193. "pand %2, %%mm0\n\t"
  1194. "pand %3, %%mm1\n\t"
  1195. "pand %4, %%mm2\n\t"
  1196. "psllq $3, %%mm0\n\t"
  1197. "psrlq $2, %%mm1\n\t"
  1198. "psrlq $7, %%mm2\n\t"
  1199. "movq %%mm0, %%mm3\n\t"
  1200. "movq %%mm1, %%mm4\n\t"
  1201. "movq %%mm2, %%mm5\n\t"
  1202. "punpcklwd %%mm7, %%mm0\n\t"
  1203. "punpcklwd %%mm7, %%mm1\n\t"
  1204. "punpcklwd %%mm7, %%mm2\n\t"
  1205. "punpckhwd %%mm7, %%mm3\n\t"
  1206. "punpckhwd %%mm7, %%mm4\n\t"
  1207. "punpckhwd %%mm7, %%mm5\n\t"
  1208. "psllq $8, %%mm1\n\t"
  1209. "psllq $16, %%mm2\n\t"
  1210. "por %%mm1, %%mm0\n\t"
  1211. "por %%mm2, %%mm0\n\t"
  1212. "psllq $8, %%mm4\n\t"
  1213. "psllq $16, %%mm5\n\t"
  1214. "por %%mm4, %%mm3\n\t"
  1215. "por %%mm5, %%mm3\n\t"
  1216. MOVNTQ" %%mm0, %0\n\t"
  1217. MOVNTQ" %%mm3, 8%0\n\t"
  1218. :"=m"(*d)
  1219. :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
  1220. :"memory");
  1221. d += 16;
  1222. s += 4;
  1223. }
  1224. __asm __volatile(SFENCE:::"memory");
  1225. __asm __volatile(EMMS:::"memory");
  1226. #endif
  1227. while(s < end)
  1228. {
  1229. #if 0 //slightly slower on athlon
  1230. int bgr= *s++;
  1231. *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
  1232. #else
  1233. register uint16_t bgr;
  1234. bgr = *s++;
  1235. #ifdef WORDS_BIGENDIAN
  1236. *d++ = 0;
  1237. *d++ = (bgr&0x7C00)>>7;
  1238. *d++ = (bgr&0x3E0)>>2;
  1239. *d++ = (bgr&0x1F)<<3;
  1240. #else
  1241. *d++ = (bgr&0x1F)<<3;
  1242. *d++ = (bgr&0x3E0)>>2;
  1243. *d++ = (bgr&0x7C00)>>7;
  1244. *d++ = 0;
  1245. #endif
  1246. #endif
  1247. }
  1248. }
  1249. static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
  1250. {
  1251. const uint16_t *end;
  1252. #ifdef HAVE_MMX
  1253. const uint16_t *mm_end;
  1254. #endif
  1255. uint8_t *d = (uint8_t *)dst;
  1256. const uint16_t *s = (uint16_t *)src;
  1257. end = s + src_size/2;
  1258. #ifdef HAVE_MMX
  1259. __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
  1260. __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
  1261. mm_end = end - 3;
  1262. while(s < mm_end)
  1263. {
  1264. __asm __volatile(
  1265. PREFETCH" 32%1\n\t"
  1266. "movq %1, %%mm0\n\t"
  1267. "movq %1, %%mm1\n\t"
  1268. "movq %1, %%mm2\n\t"
  1269. "pand %2, %%mm0\n\t"
  1270. "pand %3, %%mm1\n\t"
  1271. "pand %4, %%mm2\n\t"
  1272. "psllq $3, %%mm0\n\t"
  1273. "psrlq $3, %%mm1\n\t"
  1274. "psrlq $8, %%mm2\n\t"
  1275. "movq %%mm0, %%mm3\n\t"
  1276. "movq %%mm1, %%mm4\n\t"
  1277. "movq %%mm2, %%mm5\n\t"
  1278. "punpcklwd %%mm7, %%mm0\n\t"
  1279. "punpcklwd %%mm7, %%mm1\n\t"
  1280. "punpcklwd %%mm7, %%mm2\n\t"
  1281. "punpckhwd %%mm7, %%mm3\n\t"
  1282. "punpckhwd %%mm7, %%mm4\n\t"
  1283. "punpckhwd %%mm7, %%mm5\n\t"
  1284. "psllq $8, %%mm1\n\t"
  1285. "psllq $16, %%mm2\n\t"
  1286. "por %%mm1, %%mm0\n\t"
  1287. "por %%mm2, %%mm0\n\t"
  1288. "psllq $8, %%mm4\n\t"
  1289. "psllq $16, %%mm5\n\t"
  1290. "por %%mm4, %%mm3\n\t"
  1291. "por %%mm5, %%mm3\n\t"
  1292. MOVNTQ" %%mm0, %0\n\t"
  1293. MOVNTQ" %%mm3, 8%0\n\t"
  1294. :"=m"(*d)
  1295. :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
  1296. :"memory");
  1297. d += 16;
  1298. s += 4;
  1299. }
  1300. __asm __volatile(SFENCE:::"memory");
  1301. __asm __volatile(EMMS:::"memory");
  1302. #endif
  1303. while(s < end)
  1304. {
  1305. register uint16_t bgr;
  1306. bgr = *s++;
  1307. #ifdef WORDS_BIGENDIAN
  1308. *d++ = 0;
  1309. *d++ = (bgr&0xF800)>>8;
  1310. *d++ = (bgr&0x7E0)>>3;
  1311. *d++ = (bgr&0x1F)<<3;
  1312. #else
  1313. *d++ = (bgr&0x1F)<<3;
  1314. *d++ = (bgr&0x7E0)>>3;
  1315. *d++ = (bgr&0xF800)>>8;
  1316. *d++ = 0;
  1317. #endif
  1318. }
  1319. }
  1320. static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
  1321. {
  1322. #ifdef HAVE_MMX
  1323. /* TODO: unroll this loop */
  1324. asm volatile (
  1325. "xor %%"REG_a", %%"REG_a" \n\t"
  1326. ASMALIGN(4)
  1327. "1: \n\t"
  1328. PREFETCH" 32(%0, %%"REG_a") \n\t"
  1329. "movq (%0, %%"REG_a"), %%mm0 \n\t"
  1330. "movq %%mm0, %%mm1 \n\t"
  1331. "movq %%mm0, %%mm2 \n\t"
  1332. "pslld $16, %%mm0 \n\t"
  1333. "psrld $16, %%mm1 \n\t"
  1334. "pand "MANGLE(mask32r)", %%mm0 \n\t"
  1335. "pand "MANGLE(mask32g)", %%mm2 \n\t"
  1336. "pand "MANGLE(mask32b)", %%mm1 \n\t"
  1337. "por %%mm0, %%mm2 \n\t"
  1338. "por %%mm1, %%mm2 \n\t"
  1339. MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
  1340. "add $8, %%"REG_a" \n\t"
  1341. "cmp %2, %%"REG_a" \n\t"
  1342. " jb 1b \n\t"
  1343. :: "r" (src), "r"(dst), "r" (src_size-7)
  1344. : "%"REG_a
  1345. );
  1346. __asm __volatile(SFENCE:::"memory");
  1347. __asm __volatile(EMMS:::"memory");
  1348. #else
  1349. unsigned i;
  1350. unsigned num_pixels = src_size >> 2;
  1351. for(i=0; i<num_pixels; i++)
  1352. {
  1353. #ifdef WORDS_BIGENDIAN
  1354. dst[4*i + 1] = src[4*i + 3];
  1355. dst[4*i + 2] = src[4*i + 2];
  1356. dst[4*i + 3] = src[4*i + 1];
  1357. #else
  1358. dst[4*i + 0] = src[4*i + 2];
  1359. dst[4*i + 1] = src[4*i + 1];
  1360. dst[4*i + 2] = src[4*i + 0];
  1361. #endif
  1362. }
  1363. #endif
  1364. }
  1365. static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
  1366. {
  1367. unsigned i;
  1368. #ifdef HAVE_MMX
  1369. long mmx_size= 23 - src_size;
  1370. asm volatile (
  1371. "movq "MANGLE(mask24r)", %%mm5 \n\t"
  1372. "movq "MANGLE(mask24g)", %%mm6 \n\t"
  1373. "movq "MANGLE(mask24b)", %%mm7 \n\t"
  1374. ASMALIGN(4)
  1375. "1: \n\t"
  1376. PREFETCH" 32(%1, %%"REG_a") \n\t"
  1377. "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
  1378. "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
  1379. "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
  1380. "psllq $16, %%mm0 \n\t" // 00 BGR BGR
  1381. "pand %%mm5, %%mm0 \n\t"
  1382. "pand %%mm6, %%mm1 \n\t"
  1383. "pand %%mm7, %%mm2 \n\t"
  1384. "por %%mm0, %%mm1 \n\t"
  1385. "por %%mm2, %%mm1 \n\t"
  1386. "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
  1387. MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
  1388. "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
  1389. "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
  1390. "pand %%mm7, %%mm0 \n\t"
  1391. "pand %%mm5, %%mm1 \n\t"
  1392. "pand %%mm6, %%mm2 \n\t"
  1393. "por %%mm0, %%mm1 \n\t"
  1394. "por %%mm2, %%mm1 \n\t"
  1395. "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
  1396. MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
  1397. "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
  1398. "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
  1399. "pand %%mm6, %%mm0 \n\t"
  1400. "pand %%mm7, %%mm1 \n\t"
  1401. "pand %%mm5, %%mm2 \n\t"
  1402. "por %%mm0, %%mm1 \n\t"
  1403. "por %%mm2, %%mm1 \n\t"
  1404. MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
  1405. "add $24, %%"REG_a" \n\t"
  1406. " js 1b \n\t"
  1407. : "+a" (mmx_size)
  1408. : "r" (src-mmx_size), "r"(dst-mmx_size)
  1409. );
  1410. __asm __volatile(SFENCE:::"memory");
  1411. __asm __volatile(EMMS:::"memory");
  1412. if(mmx_size==23) return; //finihsed, was multiple of 8
  1413. src+= src_size;
  1414. dst+= src_size;
  1415. src_size= 23-mmx_size;
  1416. src-= src_size;
  1417. dst-= src_size;
  1418. #endif
  1419. for(i=0; i<src_size; i+=3)
  1420. {
  1421. register uint8_t x;
  1422. x = src[i + 2];
  1423. dst[i + 1] = src[i + 1];
  1424. dst[i + 2] = src[i + 0];
  1425. dst[i + 0] = x;
  1426. }
  1427. }
  1428. static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1429. long width, long height,
  1430. long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
  1431. {
  1432. long y;
  1433. const long chromWidth= width>>1;
  1434. for(y=0; y<height; y++)
  1435. {
  1436. #ifdef HAVE_MMX
  1437. //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
  1438. asm volatile(
  1439. "xor %%"REG_a", %%"REG_a" \n\t"
  1440. ASMALIGN(4)
  1441. "1: \n\t"
  1442. PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
  1443. PREFETCH" 32(%2, %%"REG_a") \n\t"
  1444. PREFETCH" 32(%3, %%"REG_a") \n\t"
  1445. "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
  1446. "movq %%mm0, %%mm2 \n\t" // U(0)
  1447. "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
  1448. "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1449. "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
  1450. "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
  1451. "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
  1452. "movq %%mm3, %%mm4 \n\t" // Y(0)
  1453. "movq %%mm5, %%mm6 \n\t" // Y(8)
  1454. "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
  1455. "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
  1456. "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
  1457. "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
  1458. MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
  1459. MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
  1460. MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
  1461. MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
  1462. "add $8, %%"REG_a" \n\t"
  1463. "cmp %4, %%"REG_a" \n\t"
  1464. " jb 1b \n\t"
  1465. ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
  1466. : "%"REG_a
  1467. );
  1468. #else
  1469. #if defined ARCH_ALPHA && defined HAVE_MVI
  1470. #define pl2yuy2(n) \
  1471. y1 = yc[n]; \
  1472. y2 = yc2[n]; \
  1473. u = uc[n]; \
  1474. v = vc[n]; \
  1475. asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
  1476. asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
  1477. asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
  1478. asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
  1479. yuv1 = (u << 8) + (v << 24); \
  1480. yuv2 = yuv1 + y2; \
  1481. yuv1 += y1; \
  1482. qdst[n] = yuv1; \
  1483. qdst2[n] = yuv2;
  1484. int i;
  1485. uint64_t *qdst = (uint64_t *) dst;
  1486. uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
  1487. const uint32_t *yc = (uint32_t *) ysrc;
  1488. const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
  1489. const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
  1490. for(i = 0; i < chromWidth; i += 8){
  1491. uint64_t y1, y2, yuv1, yuv2;
  1492. uint64_t u, v;
  1493. /* Prefetch */
  1494. asm("ldq $31,64(%0)" :: "r"(yc));
  1495. asm("ldq $31,64(%0)" :: "r"(yc2));
  1496. asm("ldq $31,64(%0)" :: "r"(uc));
  1497. asm("ldq $31,64(%0)" :: "r"(vc));
  1498. pl2yuy2(0);
  1499. pl2yuy2(1);
  1500. pl2yuy2(2);
  1501. pl2yuy2(3);
  1502. yc += 4;
  1503. yc2 += 4;
  1504. uc += 4;
  1505. vc += 4;
  1506. qdst += 4;
  1507. qdst2 += 4;
  1508. }
  1509. y++;
  1510. ysrc += lumStride;
  1511. dst += dstStride;
  1512. #elif __WORDSIZE >= 64
  1513. int i;
  1514. uint64_t *ldst = (uint64_t *) dst;
  1515. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1516. for(i = 0; i < chromWidth; i += 2){
  1517. uint64_t k, l;
  1518. k = yc[0] + (uc[0] << 8) +
  1519. (yc[1] << 16) + (vc[0] << 24);
  1520. l = yc[2] + (uc[1] << 8) +
  1521. (yc[3] << 16) + (vc[1] << 24);
  1522. *ldst++ = k + (l << 32);
  1523. yc += 4;
  1524. uc += 2;
  1525. vc += 2;
  1526. }
  1527. #else
  1528. int i, *idst = (int32_t *) dst;
  1529. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1530. for(i = 0; i < chromWidth; i++){
  1531. #ifdef WORDS_BIGENDIAN
  1532. *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
  1533. (yc[1] << 8) + (vc[0] << 0);
  1534. #else
  1535. *idst++ = yc[0] + (uc[0] << 8) +
  1536. (yc[1] << 16) + (vc[0] << 24);
  1537. #endif
  1538. yc += 2;
  1539. uc++;
  1540. vc++;
  1541. }
  1542. #endif
  1543. #endif
  1544. if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
  1545. {
  1546. usrc += chromStride;
  1547. vsrc += chromStride;
  1548. }
  1549. ysrc += lumStride;
  1550. dst += dstStride;
  1551. }
  1552. #ifdef HAVE_MMX
  1553. asm( EMMS" \n\t"
  1554. SFENCE" \n\t"
  1555. :::"memory");
  1556. #endif
  1557. }
  1558. /**
  1559. *
  1560. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1561. * problem for anyone then tell me, and ill fix it)
  1562. */
  1563. static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1564. long width, long height,
  1565. long lumStride, long chromStride, long dstStride)
  1566. {
  1567. //FIXME interpolate chroma
  1568. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1569. }
  1570. static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1571. long width, long height,
  1572. long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
  1573. {
  1574. long y;
  1575. const long chromWidth= width>>1;
  1576. for(y=0; y<height; y++)
  1577. {
  1578. #ifdef HAVE_MMX
  1579. //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
  1580. asm volatile(
  1581. "xor %%"REG_a", %%"REG_a" \n\t"
  1582. ASMALIGN(4)
  1583. "1: \n\t"
  1584. PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
  1585. PREFETCH" 32(%2, %%"REG_a") \n\t"
  1586. PREFETCH" 32(%3, %%"REG_a") \n\t"
  1587. "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
  1588. "movq %%mm0, %%mm2 \n\t" // U(0)
  1589. "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
  1590. "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1591. "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
  1592. "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
  1593. "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
  1594. "movq %%mm0, %%mm4 \n\t" // Y(0)
  1595. "movq %%mm2, %%mm6 \n\t" // Y(8)
  1596. "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
  1597. "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
  1598. "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
  1599. "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
  1600. MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
  1601. MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
  1602. MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
  1603. MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
  1604. "add $8, %%"REG_a" \n\t"
  1605. "cmp %4, %%"REG_a" \n\t"
  1606. " jb 1b \n\t"
  1607. ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
  1608. : "%"REG_a
  1609. );
  1610. #else
  1611. //FIXME adapt the alpha asm code from yv12->yuy2
  1612. #if __WORDSIZE >= 64
  1613. int i;
  1614. uint64_t *ldst = (uint64_t *) dst;
  1615. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1616. for(i = 0; i < chromWidth; i += 2){
  1617. uint64_t k, l;
  1618. k = uc[0] + (yc[0] << 8) +
  1619. (vc[0] << 16) + (yc[1] << 24);
  1620. l = uc[1] + (yc[2] << 8) +
  1621. (vc[1] << 16) + (yc[3] << 24);
  1622. *ldst++ = k + (l << 32);
  1623. yc += 4;
  1624. uc += 2;
  1625. vc += 2;
  1626. }
  1627. #else
  1628. int i, *idst = (int32_t *) dst;
  1629. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1630. for(i = 0; i < chromWidth; i++){
  1631. #ifdef WORDS_BIGENDIAN
  1632. *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
  1633. (vc[0] << 8) + (yc[1] << 0);
  1634. #else
  1635. *idst++ = uc[0] + (yc[0] << 8) +
  1636. (vc[0] << 16) + (yc[1] << 24);
  1637. #endif
  1638. yc += 2;
  1639. uc++;
  1640. vc++;
  1641. }
  1642. #endif
  1643. #endif
  1644. if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
  1645. {
  1646. usrc += chromStride;
  1647. vsrc += chromStride;
  1648. }
  1649. ysrc += lumStride;
  1650. dst += dstStride;
  1651. }
  1652. #ifdef HAVE_MMX
  1653. asm( EMMS" \n\t"
  1654. SFENCE" \n\t"
  1655. :::"memory");
  1656. #endif
  1657. }
  1658. /**
  1659. *
  1660. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1661. * problem for anyone then tell me, and ill fix it)
  1662. */
  1663. static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1664. long width, long height,
  1665. long lumStride, long chromStride, long dstStride)
  1666. {
  1667. //FIXME interpolate chroma
  1668. RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1669. }
  1670. /**
  1671. *
  1672. * width should be a multiple of 16
  1673. */
  1674. static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1675. long width, long height,
  1676. long lumStride, long chromStride, long dstStride)
  1677. {
  1678. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1679. }
  1680. /**
  1681. *
  1682. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1683. * problem for anyone then tell me, and ill fix it)
  1684. */
  1685. static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1686. long width, long height,
  1687. long lumStride, long chromStride, long srcStride)
  1688. {
  1689. long y;
  1690. const long chromWidth= width>>1;
  1691. for(y=0; y<height; y+=2)
  1692. {
  1693. #ifdef HAVE_MMX
  1694. asm volatile(
  1695. "xor %%"REG_a", %%"REG_a" \n\t"
  1696. "pcmpeqw %%mm7, %%mm7 \n\t"
  1697. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1698. ASMALIGN(4)
  1699. "1: \n\t"
  1700. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1701. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1702. "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
  1703. "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
  1704. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
  1705. "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
  1706. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
  1707. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1708. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1709. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1710. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1711. MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
  1712. "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
  1713. "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
  1714. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
  1715. "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
  1716. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
  1717. "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
  1718. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1719. "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1720. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1721. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1722. MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
  1723. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1724. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1725. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1726. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1727. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1728. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1729. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1730. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1731. MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
  1732. MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
  1733. "add $8, %%"REG_a" \n\t"
  1734. "cmp %4, %%"REG_a" \n\t"
  1735. " jb 1b \n\t"
  1736. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1737. : "memory", "%"REG_a
  1738. );
  1739. ydst += lumStride;
  1740. src += srcStride;
  1741. asm volatile(
  1742. "xor %%"REG_a", %%"REG_a" \n\t"
  1743. ASMALIGN(4)
  1744. "1: \n\t"
  1745. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1746. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1747. "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
  1748. "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
  1749. "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
  1750. "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1751. "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1752. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1753. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1754. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1755. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1756. MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
  1757. MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
  1758. "add $8, %%"REG_a" \n\t"
  1759. "cmp %4, %%"REG_a" \n\t"
  1760. " jb 1b \n\t"
  1761. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1762. : "memory", "%"REG_a
  1763. );
  1764. #else
  1765. long i;
  1766. for(i=0; i<chromWidth; i++)
  1767. {
  1768. ydst[2*i+0] = src[4*i+0];
  1769. udst[i] = src[4*i+1];
  1770. ydst[2*i+1] = src[4*i+2];
  1771. vdst[i] = src[4*i+3];
  1772. }
  1773. ydst += lumStride;
  1774. src += srcStride;
  1775. for(i=0; i<chromWidth; i++)
  1776. {
  1777. ydst[2*i+0] = src[4*i+0];
  1778. ydst[2*i+1] = src[4*i+2];
  1779. }
  1780. #endif
  1781. udst += chromStride;
  1782. vdst += chromStride;
  1783. ydst += lumStride;
  1784. src += srcStride;
  1785. }
  1786. #ifdef HAVE_MMX
  1787. asm volatile( EMMS" \n\t"
  1788. SFENCE" \n\t"
  1789. :::"memory");
  1790. #endif
  1791. }
  1792. static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
  1793. uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1794. long width, long height, long lumStride, long chromStride)
  1795. {
  1796. /* Y Plane */
  1797. memcpy(ydst, ysrc, width*height);
  1798. /* XXX: implement upscaling for U,V */
  1799. }
  1800. static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
  1801. {
  1802. long x,y;
  1803. dst[0]= src[0];
  1804. // first line
  1805. for(x=0; x<srcWidth-1; x++){
  1806. dst[2*x+1]= (3*src[x] + src[x+1])>>2;
  1807. dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
  1808. }
  1809. dst[2*srcWidth-1]= src[srcWidth-1];
  1810. dst+= dstStride;
  1811. for(y=1; y<srcHeight; y++){
  1812. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  1813. const long mmxSize= srcWidth&~15;
  1814. asm volatile(
  1815. "mov %4, %%"REG_a" \n\t"
  1816. "1: \n\t"
  1817. "movq (%0, %%"REG_a"), %%mm0 \n\t"
  1818. "movq (%1, %%"REG_a"), %%mm1 \n\t"
  1819. "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
  1820. "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
  1821. "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
  1822. "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
  1823. PAVGB" %%mm0, %%mm5 \n\t"
  1824. PAVGB" %%mm0, %%mm3 \n\t"
  1825. PAVGB" %%mm0, %%mm5 \n\t"
  1826. PAVGB" %%mm0, %%mm3 \n\t"
  1827. PAVGB" %%mm1, %%mm4 \n\t"
  1828. PAVGB" %%mm1, %%mm2 \n\t"
  1829. PAVGB" %%mm1, %%mm4 \n\t"
  1830. PAVGB" %%mm1, %%mm2 \n\t"
  1831. "movq %%mm5, %%mm7 \n\t"
  1832. "movq %%mm4, %%mm6 \n\t"
  1833. "punpcklbw %%mm3, %%mm5 \n\t"
  1834. "punpckhbw %%mm3, %%mm7 \n\t"
  1835. "punpcklbw %%mm2, %%mm4 \n\t"
  1836. "punpckhbw %%mm2, %%mm6 \n\t"
  1837. #if 1
  1838. MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
  1839. MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
  1840. MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
  1841. MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
  1842. #else
  1843. "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
  1844. "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
  1845. "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
  1846. "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
  1847. #endif
  1848. "add $8, %%"REG_a" \n\t"
  1849. " js 1b \n\t"
  1850. :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
  1851. "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
  1852. "g" (-mmxSize)
  1853. : "%"REG_a
  1854. );
  1855. #else
  1856. const long mmxSize=1;
  1857. #endif
  1858. dst[0 ]= (3*src[0] + src[srcStride])>>2;
  1859. dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
  1860. for(x=mmxSize-1; x<srcWidth-1; x++){
  1861. dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
  1862. dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
  1863. dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
  1864. dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
  1865. }
  1866. dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
  1867. dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
  1868. dst+=dstStride*2;
  1869. src+=srcStride;
  1870. }
  1871. // last line
  1872. #if 1
  1873. dst[0]= src[0];
  1874. for(x=0; x<srcWidth-1; x++){
  1875. dst[2*x+1]= (3*src[x] + src[x+1])>>2;
  1876. dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
  1877. }
  1878. dst[2*srcWidth-1]= src[srcWidth-1];
  1879. #else
  1880. for(x=0; x<srcWidth; x++){
  1881. dst[2*x+0]=
  1882. dst[2*x+1]= src[x];
  1883. }
  1884. #endif
  1885. #ifdef HAVE_MMX
  1886. asm volatile( EMMS" \n\t"
  1887. SFENCE" \n\t"
  1888. :::"memory");
  1889. #endif
  1890. }
  1891. /**
  1892. *
  1893. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1894. * problem for anyone then tell me, and ill fix it)
  1895. * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
  1896. */
  1897. static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1898. long width, long height,
  1899. long lumStride, long chromStride, long srcStride)
  1900. {
  1901. long y;
  1902. const long chromWidth= width>>1;
  1903. for(y=0; y<height; y+=2)
  1904. {
  1905. #ifdef HAVE_MMX
  1906. asm volatile(
  1907. "xorl %%eax, %%eax \n\t"
  1908. "pcmpeqw %%mm7, %%mm7 \n\t"
  1909. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1910. ASMALIGN(4)
  1911. "1: \n\t"
  1912. PREFETCH" 64(%0, %%eax, 4) \n\t"
  1913. "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
  1914. "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
  1915. "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
  1916. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
  1917. "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
  1918. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
  1919. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1920. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1921. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1922. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1923. MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
  1924. "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
  1925. "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
  1926. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
  1927. "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
  1928. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
  1929. "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
  1930. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1931. "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1932. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1933. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1934. MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
  1935. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1936. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1937. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1938. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1939. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1940. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1941. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1942. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1943. MOVNTQ" %%mm0, (%3, %%eax) \n\t"
  1944. MOVNTQ" %%mm2, (%2, %%eax) \n\t"
  1945. "addl $8, %%eax \n\t"
  1946. "cmpl %4, %%eax \n\t"
  1947. " jb 1b \n\t"
  1948. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1949. : "memory", "%eax"
  1950. );
  1951. ydst += lumStride;
  1952. src += srcStride;
  1953. asm volatile(
  1954. "xorl %%eax, %%eax \n\t"
  1955. ASMALIGN(4)
  1956. "1: \n\t"
  1957. PREFETCH" 64(%0, %%eax, 4) \n\t"
  1958. "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
  1959. "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
  1960. "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
  1961. "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
  1962. "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1963. "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1964. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1965. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1966. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1967. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1968. MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
  1969. MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
  1970. "addl $8, %%eax \n\t"
  1971. "cmpl %4, %%eax \n\t"
  1972. " jb 1b \n\t"
  1973. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1974. : "memory", "%eax"
  1975. );
  1976. #else
  1977. long i;
  1978. for(i=0; i<chromWidth; i++)
  1979. {
  1980. udst[i] = src[4*i+0];
  1981. ydst[2*i+0] = src[4*i+1];
  1982. vdst[i] = src[4*i+2];
  1983. ydst[2*i+1] = src[4*i+3];
  1984. }
  1985. ydst += lumStride;
  1986. src += srcStride;
  1987. for(i=0; i<chromWidth; i++)
  1988. {
  1989. ydst[2*i+0] = src[4*i+1];
  1990. ydst[2*i+1] = src[4*i+3];
  1991. }
  1992. #endif
  1993. udst += chromStride;
  1994. vdst += chromStride;
  1995. ydst += lumStride;
  1996. src += srcStride;
  1997. }
  1998. #ifdef HAVE_MMX
  1999. asm volatile( EMMS" \n\t"
  2000. SFENCE" \n\t"
  2001. :::"memory");
  2002. #endif
  2003. }
  2004. /**
  2005. *
  2006. * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
  2007. * problem for anyone then tell me, and ill fix it)
  2008. * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
  2009. */
  2010. static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  2011. long width, long height,
  2012. long lumStride, long chromStride, long srcStride)
  2013. {
  2014. long y;
  2015. const long chromWidth= width>>1;
  2016. #ifdef HAVE_MMX
  2017. for(y=0; y<height-2; y+=2)
  2018. {
  2019. long i;
  2020. for(i=0; i<2; i++)
  2021. {
  2022. asm volatile(
  2023. "mov %2, %%"REG_a" \n\t"
  2024. "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
  2025. "movq "MANGLE(w1111)", %%mm5 \n\t"
  2026. "pxor %%mm7, %%mm7 \n\t"
  2027. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
  2028. ASMALIGN(4)
  2029. "1: \n\t"
  2030. PREFETCH" 64(%0, %%"REG_d") \n\t"
  2031. "movd (%0, %%"REG_d"), %%mm0 \n\t"
  2032. "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
  2033. "punpcklbw %%mm7, %%mm0 \n\t"
  2034. "punpcklbw %%mm7, %%mm1 \n\t"
  2035. "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
  2036. "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
  2037. "punpcklbw %%mm7, %%mm2 \n\t"
  2038. "punpcklbw %%mm7, %%mm3 \n\t"
  2039. "pmaddwd %%mm6, %%mm0 \n\t"
  2040. "pmaddwd %%mm6, %%mm1 \n\t"
  2041. "pmaddwd %%mm6, %%mm2 \n\t"
  2042. "pmaddwd %%mm6, %%mm3 \n\t"
  2043. #ifndef FAST_BGR2YV12
  2044. "psrad $8, %%mm0 \n\t"
  2045. "psrad $8, %%mm1 \n\t"
  2046. "psrad $8, %%mm2 \n\t"
  2047. "psrad $8, %%mm3 \n\t"
  2048. #endif
  2049. "packssdw %%mm1, %%mm0 \n\t"
  2050. "packssdw %%mm3, %%mm2 \n\t"
  2051. "pmaddwd %%mm5, %%mm0 \n\t"
  2052. "pmaddwd %%mm5, %%mm2 \n\t"
  2053. "packssdw %%mm2, %%mm0 \n\t"
  2054. "psraw $7, %%mm0 \n\t"
  2055. "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
  2056. "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
  2057. "punpcklbw %%mm7, %%mm4 \n\t"
  2058. "punpcklbw %%mm7, %%mm1 \n\t"
  2059. "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
  2060. "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
  2061. "punpcklbw %%mm7, %%mm2 \n\t"
  2062. "punpcklbw %%mm7, %%mm3 \n\t"
  2063. "pmaddwd %%mm6, %%mm4 \n\t"
  2064. "pmaddwd %%mm6, %%mm1 \n\t"
  2065. "pmaddwd %%mm6, %%mm2 \n\t"
  2066. "pmaddwd %%mm6, %%mm3 \n\t"
  2067. #ifndef FAST_BGR2YV12
  2068. "psrad $8, %%mm4 \n\t"
  2069. "psrad $8, %%mm1 \n\t"
  2070. "psrad $8, %%mm2 \n\t"
  2071. "psrad $8, %%mm3 \n\t"
  2072. #endif
  2073. "packssdw %%mm1, %%mm4 \n\t"
  2074. "packssdw %%mm3, %%mm2 \n\t"
  2075. "pmaddwd %%mm5, %%mm4 \n\t"
  2076. "pmaddwd %%mm5, %%mm2 \n\t"
  2077. "add $24, %%"REG_d" \n\t"
  2078. "packssdw %%mm2, %%mm4 \n\t"
  2079. "psraw $7, %%mm4 \n\t"
  2080. "packuswb %%mm4, %%mm0 \n\t"
  2081. "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
  2082. MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
  2083. "add $8, %%"REG_a" \n\t"
  2084. " js 1b \n\t"
  2085. : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
  2086. : "%"REG_a, "%"REG_d
  2087. );
  2088. ydst += lumStride;
  2089. src += srcStride;
  2090. }
  2091. src -= srcStride*2;
  2092. asm volatile(
  2093. "mov %4, %%"REG_a" \n\t"
  2094. "movq "MANGLE(w1111)", %%mm5 \n\t"
  2095. "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
  2096. "pxor %%mm7, %%mm7 \n\t"
  2097. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
  2098. "add %%"REG_d", %%"REG_d" \n\t"
  2099. ASMALIGN(4)
  2100. "1: \n\t"
  2101. PREFETCH" 64(%0, %%"REG_d") \n\t"
  2102. PREFETCH" 64(%1, %%"REG_d") \n\t"
  2103. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  2104. "movq (%0, %%"REG_d"), %%mm0 \n\t"
  2105. "movq (%1, %%"REG_d"), %%mm1 \n\t"
  2106. "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
  2107. "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
  2108. PAVGB" %%mm1, %%mm0 \n\t"
  2109. PAVGB" %%mm3, %%mm2 \n\t"
  2110. "movq %%mm0, %%mm1 \n\t"
  2111. "movq %%mm2, %%mm3 \n\t"
  2112. "psrlq $24, %%mm0 \n\t"
  2113. "psrlq $24, %%mm2 \n\t"
  2114. PAVGB" %%mm1, %%mm0 \n\t"
  2115. PAVGB" %%mm3, %%mm2 \n\t"
  2116. "punpcklbw %%mm7, %%mm0 \n\t"
  2117. "punpcklbw %%mm7, %%mm2 \n\t"
  2118. #else
  2119. "movd (%0, %%"REG_d"), %%mm0 \n\t"
  2120. "movd (%1, %%"REG_d"), %%mm1 \n\t"
  2121. "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
  2122. "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
  2123. "punpcklbw %%mm7, %%mm0 \n\t"
  2124. "punpcklbw %%mm7, %%mm1 \n\t"
  2125. "punpcklbw %%mm7, %%mm2 \n\t"
  2126. "punpcklbw %%mm7, %%mm3 \n\t"
  2127. "paddw %%mm1, %%mm0 \n\t"
  2128. "paddw %%mm3, %%mm2 \n\t"
  2129. "paddw %%mm2, %%mm0 \n\t"
  2130. "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
  2131. "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
  2132. "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
  2133. "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
  2134. "punpcklbw %%mm7, %%mm4 \n\t"
  2135. "punpcklbw %%mm7, %%mm1 \n\t"
  2136. "punpcklbw %%mm7, %%mm2 \n\t"
  2137. "punpcklbw %%mm7, %%mm3 \n\t"
  2138. "paddw %%mm1, %%mm4 \n\t"
  2139. "paddw %%mm3, %%mm2 \n\t"
  2140. "paddw %%mm4, %%mm2 \n\t"
  2141. "psrlw $2, %%mm0 \n\t"
  2142. "psrlw $2, %%mm2 \n\t"
  2143. #endif
  2144. "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
  2145. "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
  2146. "pmaddwd %%mm0, %%mm1 \n\t"
  2147. "pmaddwd %%mm2, %%mm3 \n\t"
  2148. "pmaddwd %%mm6, %%mm0 \n\t"
  2149. "pmaddwd %%mm6, %%mm2 \n\t"
  2150. #ifndef FAST_BGR2YV12
  2151. "psrad $8, %%mm0 \n\t"
  2152. "psrad $8, %%mm1 \n\t"
  2153. "psrad $8, %%mm2 \n\t"
  2154. "psrad $8, %%mm3 \n\t"
  2155. #endif
  2156. "packssdw %%mm2, %%mm0 \n\t"
  2157. "packssdw %%mm3, %%mm1 \n\t"
  2158. "pmaddwd %%mm5, %%mm0 \n\t"
  2159. "pmaddwd %%mm5, %%mm1 \n\t"
  2160. "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
  2161. "psraw $7, %%mm0 \n\t"
  2162. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  2163. "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
  2164. "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
  2165. "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
  2166. "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
  2167. PAVGB" %%mm1, %%mm4 \n\t"
  2168. PAVGB" %%mm3, %%mm2 \n\t"
  2169. "movq %%mm4, %%mm1 \n\t"
  2170. "movq %%mm2, %%mm3 \n\t"
  2171. "psrlq $24, %%mm4 \n\t"
  2172. "psrlq $24, %%mm2 \n\t"
  2173. PAVGB" %%mm1, %%mm4 \n\t"
  2174. PAVGB" %%mm3, %%mm2 \n\t"
  2175. "punpcklbw %%mm7, %%mm4 \n\t"
  2176. "punpcklbw %%mm7, %%mm2 \n\t"
  2177. #else
  2178. "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
  2179. "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
  2180. "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
  2181. "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
  2182. "punpcklbw %%mm7, %%mm4 \n\t"
  2183. "punpcklbw %%mm7, %%mm1 \n\t"
  2184. "punpcklbw %%mm7, %%mm2 \n\t"
  2185. "punpcklbw %%mm7, %%mm3 \n\t"
  2186. "paddw %%mm1, %%mm4 \n\t"
  2187. "paddw %%mm3, %%mm2 \n\t"
  2188. "paddw %%mm2, %%mm4 \n\t"
  2189. "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
  2190. "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
  2191. "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
  2192. "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
  2193. "punpcklbw %%mm7, %%mm5 \n\t"
  2194. "punpcklbw %%mm7, %%mm1 \n\t"
  2195. "punpcklbw %%mm7, %%mm2 \n\t"
  2196. "punpcklbw %%mm7, %%mm3 \n\t"
  2197. "paddw %%mm1, %%mm5 \n\t"
  2198. "paddw %%mm3, %%mm2 \n\t"
  2199. "paddw %%mm5, %%mm2 \n\t"
  2200. "movq "MANGLE(w1111)", %%mm5 \n\t"
  2201. "psrlw $2, %%mm4 \n\t"
  2202. "psrlw $2, %%mm2 \n\t"
  2203. #endif
  2204. "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
  2205. "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
  2206. "pmaddwd %%mm4, %%mm1 \n\t"
  2207. "pmaddwd %%mm2, %%mm3 \n\t"
  2208. "pmaddwd %%mm6, %%mm4 \n\t"
  2209. "pmaddwd %%mm6, %%mm2 \n\t"
  2210. #ifndef FAST_BGR2YV12
  2211. "psrad $8, %%mm4 \n\t"
  2212. "psrad $8, %%mm1 \n\t"
  2213. "psrad $8, %%mm2 \n\t"
  2214. "psrad $8, %%mm3 \n\t"
  2215. #endif
  2216. "packssdw %%mm2, %%mm4 \n\t"
  2217. "packssdw %%mm3, %%mm1 \n\t"
  2218. "pmaddwd %%mm5, %%mm4 \n\t"
  2219. "pmaddwd %%mm5, %%mm1 \n\t"
  2220. "add $24, %%"REG_d" \n\t"
  2221. "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
  2222. "psraw $7, %%mm4 \n\t"
  2223. "movq %%mm0, %%mm1 \n\t"
  2224. "punpckldq %%mm4, %%mm0 \n\t"
  2225. "punpckhdq %%mm4, %%mm1 \n\t"
  2226. "packsswb %%mm1, %%mm0 \n\t"
  2227. "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
  2228. "movd %%mm0, (%2, %%"REG_a") \n\t"
  2229. "punpckhdq %%mm0, %%mm0 \n\t"
  2230. "movd %%mm0, (%3, %%"REG_a") \n\t"
  2231. "add $4, %%"REG_a" \n\t"
  2232. " js 1b \n\t"
  2233. : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
  2234. : "%"REG_a, "%"REG_d
  2235. );
  2236. udst += chromStride;
  2237. vdst += chromStride;
  2238. src += srcStride*2;
  2239. }
  2240. asm volatile( EMMS" \n\t"
  2241. SFENCE" \n\t"
  2242. :::"memory");
  2243. #else
  2244. y=0;
  2245. #endif
  2246. for(; y<height; y+=2)
  2247. {
  2248. long i;
  2249. for(i=0; i<chromWidth; i++)
  2250. {
  2251. unsigned int b= src[6*i+0];
  2252. unsigned int g= src[6*i+1];
  2253. unsigned int r= src[6*i+2];
  2254. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2255. unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
  2256. unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
  2257. udst[i] = U;
  2258. vdst[i] = V;
  2259. ydst[2*i] = Y;
  2260. b= src[6*i+3];
  2261. g= src[6*i+4];
  2262. r= src[6*i+5];
  2263. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2264. ydst[2*i+1] = Y;
  2265. }
  2266. ydst += lumStride;
  2267. src += srcStride;
  2268. for(i=0; i<chromWidth; i++)
  2269. {
  2270. unsigned int b= src[6*i+0];
  2271. unsigned int g= src[6*i+1];
  2272. unsigned int r= src[6*i+2];
  2273. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2274. ydst[2*i] = Y;
  2275. b= src[6*i+3];
  2276. g= src[6*i+4];
  2277. r= src[6*i+5];
  2278. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2279. ydst[2*i+1] = Y;
  2280. }
  2281. udst += chromStride;
  2282. vdst += chromStride;
  2283. ydst += lumStride;
  2284. src += srcStride;
  2285. }
  2286. }
  2287. void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
  2288. long width, long height, long src1Stride,
  2289. long src2Stride, long dstStride){
  2290. long h;
  2291. for(h=0; h < height; h++)
  2292. {
  2293. long w;
  2294. #ifdef HAVE_MMX
  2295. #ifdef HAVE_SSE2
  2296. asm(
  2297. "xor %%"REG_a", %%"REG_a" \n\t"
  2298. "1: \n\t"
  2299. PREFETCH" 64(%1, %%"REG_a") \n\t"
  2300. PREFETCH" 64(%2, %%"REG_a") \n\t"
  2301. "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
  2302. "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
  2303. "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
  2304. "punpcklbw %%xmm2, %%xmm0 \n\t"
  2305. "punpckhbw %%xmm2, %%xmm1 \n\t"
  2306. "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
  2307. "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
  2308. "add $16, %%"REG_a" \n\t"
  2309. "cmp %3, %%"REG_a" \n\t"
  2310. " jb 1b \n\t"
  2311. ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
  2312. : "memory", "%"REG_a""
  2313. );
  2314. #else
  2315. asm(
  2316. "xor %%"REG_a", %%"REG_a" \n\t"
  2317. "1: \n\t"
  2318. PREFETCH" 64(%1, %%"REG_a") \n\t"
  2319. PREFETCH" 64(%2, %%"REG_a") \n\t"
  2320. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  2321. "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
  2322. "movq %%mm0, %%mm1 \n\t"
  2323. "movq %%mm2, %%mm3 \n\t"
  2324. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  2325. "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
  2326. "punpcklbw %%mm4, %%mm0 \n\t"
  2327. "punpckhbw %%mm4, %%mm1 \n\t"
  2328. "punpcklbw %%mm5, %%mm2 \n\t"
  2329. "punpckhbw %%mm5, %%mm3 \n\t"
  2330. MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
  2331. MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
  2332. MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
  2333. MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
  2334. "add $16, %%"REG_a" \n\t"
  2335. "cmp %3, %%"REG_a" \n\t"
  2336. " jb 1b \n\t"
  2337. ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
  2338. : "memory", "%"REG_a
  2339. );
  2340. #endif
  2341. for(w= (width&(~15)); w < width; w++)
  2342. {
  2343. dest[2*w+0] = src1[w];
  2344. dest[2*w+1] = src2[w];
  2345. }
  2346. #else
  2347. for(w=0; w < width; w++)
  2348. {
  2349. dest[2*w+0] = src1[w];
  2350. dest[2*w+1] = src2[w];
  2351. }
  2352. #endif
  2353. dest += dstStride;
  2354. src1 += src1Stride;
  2355. src2 += src2Stride;
  2356. }
  2357. #ifdef HAVE_MMX
  2358. asm(
  2359. EMMS" \n\t"
  2360. SFENCE" \n\t"
  2361. ::: "memory"
  2362. );
  2363. #endif
  2364. }
  2365. static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
  2366. uint8_t *dst1, uint8_t *dst2,
  2367. long width, long height,
  2368. long srcStride1, long srcStride2,
  2369. long dstStride1, long dstStride2)
  2370. {
  2371. long y,x,w,h;
  2372. w=width/2; h=height/2;
  2373. #ifdef HAVE_MMX
  2374. asm volatile(
  2375. PREFETCH" %0\n\t"
  2376. PREFETCH" %1\n\t"
  2377. ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
  2378. #endif
  2379. for(y=0;y<h;y++){
  2380. const uint8_t* s1=src1+srcStride1*(y>>1);
  2381. uint8_t* d=dst1+dstStride1*y;
  2382. x=0;
  2383. #ifdef HAVE_MMX
  2384. for(;x<w-31;x+=32)
  2385. {
  2386. asm volatile(
  2387. PREFETCH" 32%1\n\t"
  2388. "movq %1, %%mm0\n\t"
  2389. "movq 8%1, %%mm2\n\t"
  2390. "movq 16%1, %%mm4\n\t"
  2391. "movq 24%1, %%mm6\n\t"
  2392. "movq %%mm0, %%mm1\n\t"
  2393. "movq %%mm2, %%mm3\n\t"
  2394. "movq %%mm4, %%mm5\n\t"
  2395. "movq %%mm6, %%mm7\n\t"
  2396. "punpcklbw %%mm0, %%mm0\n\t"
  2397. "punpckhbw %%mm1, %%mm1\n\t"
  2398. "punpcklbw %%mm2, %%mm2\n\t"
  2399. "punpckhbw %%mm3, %%mm3\n\t"
  2400. "punpcklbw %%mm4, %%mm4\n\t"
  2401. "punpckhbw %%mm5, %%mm5\n\t"
  2402. "punpcklbw %%mm6, %%mm6\n\t"
  2403. "punpckhbw %%mm7, %%mm7\n\t"
  2404. MOVNTQ" %%mm0, %0\n\t"
  2405. MOVNTQ" %%mm1, 8%0\n\t"
  2406. MOVNTQ" %%mm2, 16%0\n\t"
  2407. MOVNTQ" %%mm3, 24%0\n\t"
  2408. MOVNTQ" %%mm4, 32%0\n\t"
  2409. MOVNTQ" %%mm5, 40%0\n\t"
  2410. MOVNTQ" %%mm6, 48%0\n\t"
  2411. MOVNTQ" %%mm7, 56%0"
  2412. :"=m"(d[2*x])
  2413. :"m"(s1[x])
  2414. :"memory");
  2415. }
  2416. #endif
  2417. for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
  2418. }
  2419. for(y=0;y<h;y++){
  2420. const uint8_t* s2=src2+srcStride2*(y>>1);
  2421. uint8_t* d=dst2+dstStride2*y;
  2422. x=0;
  2423. #ifdef HAVE_MMX
  2424. for(;x<w-31;x+=32)
  2425. {
  2426. asm volatile(
  2427. PREFETCH" 32%1\n\t"
  2428. "movq %1, %%mm0\n\t"
  2429. "movq 8%1, %%mm2\n\t"
  2430. "movq 16%1, %%mm4\n\t"
  2431. "movq 24%1, %%mm6\n\t"
  2432. "movq %%mm0, %%mm1\n\t"
  2433. "movq %%mm2, %%mm3\n\t"
  2434. "movq %%mm4, %%mm5\n\t"
  2435. "movq %%mm6, %%mm7\n\t"
  2436. "punpcklbw %%mm0, %%mm0\n\t"
  2437. "punpckhbw %%mm1, %%mm1\n\t"
  2438. "punpcklbw %%mm2, %%mm2\n\t"
  2439. "punpckhbw %%mm3, %%mm3\n\t"
  2440. "punpcklbw %%mm4, %%mm4\n\t"
  2441. "punpckhbw %%mm5, %%mm5\n\t"
  2442. "punpcklbw %%mm6, %%mm6\n\t"
  2443. "punpckhbw %%mm7, %%mm7\n\t"
  2444. MOVNTQ" %%mm0, %0\n\t"
  2445. MOVNTQ" %%mm1, 8%0\n\t"
  2446. MOVNTQ" %%mm2, 16%0\n\t"
  2447. MOVNTQ" %%mm3, 24%0\n\t"
  2448. MOVNTQ" %%mm4, 32%0\n\t"
  2449. MOVNTQ" %%mm5, 40%0\n\t"
  2450. MOVNTQ" %%mm6, 48%0\n\t"
  2451. MOVNTQ" %%mm7, 56%0"
  2452. :"=m"(d[2*x])
  2453. :"m"(s2[x])
  2454. :"memory");
  2455. }
  2456. #endif
  2457. for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
  2458. }
  2459. #ifdef HAVE_MMX
  2460. asm(
  2461. EMMS" \n\t"
  2462. SFENCE" \n\t"
  2463. ::: "memory"
  2464. );
  2465. #endif
  2466. }
  2467. static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
  2468. uint8_t *dst,
  2469. long width, long height,
  2470. long srcStride1, long srcStride2,
  2471. long srcStride3, long dstStride)
  2472. {
  2473. long y,x,w,h;
  2474. w=width/2; h=height;
  2475. for(y=0;y<h;y++){
  2476. const uint8_t* yp=src1+srcStride1*y;
  2477. const uint8_t* up=src2+srcStride2*(y>>2);
  2478. const uint8_t* vp=src3+srcStride3*(y>>2);
  2479. uint8_t* d=dst+dstStride*y;
  2480. x=0;
  2481. #ifdef HAVE_MMX
  2482. for(;x<w-7;x+=8)
  2483. {
  2484. asm volatile(
  2485. PREFETCH" 32(%1, %0)\n\t"
  2486. PREFETCH" 32(%2, %0)\n\t"
  2487. PREFETCH" 32(%3, %0)\n\t"
  2488. "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
  2489. "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
  2490. "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
  2491. "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
  2492. "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
  2493. "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
  2494. "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
  2495. "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
  2496. "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
  2497. "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
  2498. "movq %%mm1, %%mm6\n\t"
  2499. "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
  2500. "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
  2501. "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
  2502. MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
  2503. MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
  2504. "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
  2505. "movq 8(%1, %0, 4), %%mm0\n\t"
  2506. "movq %%mm0, %%mm3\n\t"
  2507. "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
  2508. "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
  2509. MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
  2510. MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
  2511. "movq %%mm4, %%mm6\n\t"
  2512. "movq 16(%1, %0, 4), %%mm0\n\t"
  2513. "movq %%mm0, %%mm3\n\t"
  2514. "punpcklbw %%mm5, %%mm4\n\t"
  2515. "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
  2516. "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
  2517. MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
  2518. MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
  2519. "punpckhbw %%mm5, %%mm6\n\t"
  2520. "movq 24(%1, %0, 4), %%mm0\n\t"
  2521. "movq %%mm0, %%mm3\n\t"
  2522. "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
  2523. "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
  2524. MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
  2525. MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
  2526. : "+r" (x)
  2527. : "r"(yp), "r" (up), "r"(vp), "r"(d)
  2528. :"memory");
  2529. }
  2530. #endif
  2531. for(; x<w; x++)
  2532. {
  2533. const long x2= x<<2;
  2534. d[8*x+0]=yp[x2];
  2535. d[8*x+1]=up[x];
  2536. d[8*x+2]=yp[x2+1];
  2537. d[8*x+3]=vp[x];
  2538. d[8*x+4]=yp[x2+2];
  2539. d[8*x+5]=up[x];
  2540. d[8*x+6]=yp[x2+3];
  2541. d[8*x+7]=vp[x];
  2542. }
  2543. }
  2544. #ifdef HAVE_MMX
  2545. asm(
  2546. EMMS" \n\t"
  2547. SFENCE" \n\t"
  2548. ::: "memory"
  2549. );
  2550. #endif
  2551. }