You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

3078 lines
110KB

  1. /*
  2. * software RGB to RGB converter
  3. * pluralize by software PAL8 to RGB converter
  4. * software YUV to YUV converter
  5. * software YUV to RGB converter
  6. * Written by Nick Kurshev.
  7. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  8. * lot of big-endian byte order fixes by Alex Beregszaszi
  9. *
  10. * This file is part of FFmpeg.
  11. *
  12. * FFmpeg is free software; you can redistribute it and/or modify
  13. * it under the terms of the GNU General Public License as published by
  14. * the Free Software Foundation; either version 2 of the License, or
  15. * (at your option) any later version.
  16. *
  17. * FFmpeg is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20. * GNU General Public License for more details.
  21. *
  22. * You should have received a copy of the GNU General Public License
  23. * along with FFmpeg; if not, write to the Free Software
  24. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25. *
  26. * The C code (not assembly, MMX, ...) of this file can be used
  27. * under the LGPL license.
  28. */
  29. #include <stddef.h>
  /*
   * Instruction-name strings used by the inline assembly in this template.
   * Each macro is undefined first and then redefined from the HAVE_* feature
   * flags, so the values always follow the flags in effect at this point.
   */
  #undef MMREG_SIZE
  #undef PREFETCH
  #undef PREFETCHW
  #undef PAVGB
  #undef EMMS
  #undef MOVNTQ
  #undef SFENCE

  /* MMREG_SIZE is 16 when SSE2 is enabled and 8 otherwise. */
  #if HAVE_SSE2
  #define MMREG_SIZE 16
  #else
  #define MMREG_SIZE 8
  #endif

  /*
   * Prefetch, byte-average and MMX-exit instructions.
   * PAVGB stays undefined when neither 3DNow! nor MMX2 is enabled.
   */
  #if HAVE_AMD3DNOW
  #define PREFETCH "prefetch"
  #define PREFETCHW "prefetchw"
  #define PAVGB "pavgusb"
  /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
  #define EMMS "femms"
  #else
  #define EMMS "emms"
  #if HAVE_MMX2
  #define PREFETCH "prefetchnta"
  #define PREFETCHW "prefetcht0"
  #define PAVGB "pavgb"
  #else
  /* No prefetch available: the string is meant to assemble to nothing. */
  #define PREFETCH " # nop"
  #define PREFETCHW " # nop"
  #endif
  #endif

  /* With MMX2 use movntq stores and sfence; otherwise plain movq and no fence. */
  #if HAVE_MMX2
  #define MOVNTQ "movntq"
  #define SFENCE "sfence"
  #else
  #define MOVNTQ "movq"
  #define SFENCE " # nop"
  #endif
  67. static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
  68. {
  69. uint8_t *dest = dst;
  70. const uint8_t *s = src;
  71. const uint8_t *end;
  72. #if HAVE_MMX
  73. const uint8_t *mm_end;
  74. #endif
  75. end = s + src_size;
  76. #if HAVE_MMX
  77. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  78. mm_end = end - 23;
  79. __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
  80. while (s < mm_end)
  81. {
  82. __asm__ volatile(
  83. PREFETCH" 32%1 \n\t"
  84. "movd %1, %%mm0 \n\t"
  85. "punpckldq 3%1, %%mm0 \n\t"
  86. "movd 6%1, %%mm1 \n\t"
  87. "punpckldq 9%1, %%mm1 \n\t"
  88. "movd 12%1, %%mm2 \n\t"
  89. "punpckldq 15%1, %%mm2 \n\t"
  90. "movd 18%1, %%mm3 \n\t"
  91. "punpckldq 21%1, %%mm3 \n\t"
  92. "por %%mm7, %%mm0 \n\t"
  93. "por %%mm7, %%mm1 \n\t"
  94. "por %%mm7, %%mm2 \n\t"
  95. "por %%mm7, %%mm3 \n\t"
  96. MOVNTQ" %%mm0, %0 \n\t"
  97. MOVNTQ" %%mm1, 8%0 \n\t"
  98. MOVNTQ" %%mm2, 16%0 \n\t"
  99. MOVNTQ" %%mm3, 24%0"
  100. :"=m"(*dest)
  101. :"m"(*s)
  102. :"memory");
  103. dest += 32;
  104. s += 24;
  105. }
  106. __asm__ volatile(SFENCE:::"memory");
  107. __asm__ volatile(EMMS:::"memory");
  108. #endif
  109. while (s < end)
  110. {
  111. #ifdef WORDS_BIGENDIAN
  112. /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
  113. *dest++ = 255;
  114. *dest++ = s[2];
  115. *dest++ = s[1];
  116. *dest++ = s[0];
  117. s+=3;
  118. #else
  119. *dest++ = *s++;
  120. *dest++ = *s++;
  121. *dest++ = *s++;
  122. *dest++ = 255;
  123. #endif
  124. }
  125. }
  126. static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
  127. {
  128. uint8_t *dest = dst;
  129. const uint8_t *s = src;
  130. const uint8_t *end;
  131. #if HAVE_MMX
  132. const uint8_t *mm_end;
  133. #endif
  134. end = s + src_size;
  135. #if HAVE_MMX
  136. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  137. mm_end = end - 31;
  138. while (s < mm_end)
  139. {
  140. __asm__ volatile(
  141. PREFETCH" 32%1 \n\t"
  142. "movq %1, %%mm0 \n\t"
  143. "movq 8%1, %%mm1 \n\t"
  144. "movq 16%1, %%mm4 \n\t"
  145. "movq 24%1, %%mm5 \n\t"
  146. "movq %%mm0, %%mm2 \n\t"
  147. "movq %%mm1, %%mm3 \n\t"
  148. "movq %%mm4, %%mm6 \n\t"
  149. "movq %%mm5, %%mm7 \n\t"
  150. "psrlq $8, %%mm2 \n\t"
  151. "psrlq $8, %%mm3 \n\t"
  152. "psrlq $8, %%mm6 \n\t"
  153. "psrlq $8, %%mm7 \n\t"
  154. "pand %2, %%mm0 \n\t"
  155. "pand %2, %%mm1 \n\t"
  156. "pand %2, %%mm4 \n\t"
  157. "pand %2, %%mm5 \n\t"
  158. "pand %3, %%mm2 \n\t"
  159. "pand %3, %%mm3 \n\t"
  160. "pand %3, %%mm6 \n\t"
  161. "pand %3, %%mm7 \n\t"
  162. "por %%mm2, %%mm0 \n\t"
  163. "por %%mm3, %%mm1 \n\t"
  164. "por %%mm6, %%mm4 \n\t"
  165. "por %%mm7, %%mm5 \n\t"
  166. "movq %%mm1, %%mm2 \n\t"
  167. "movq %%mm4, %%mm3 \n\t"
  168. "psllq $48, %%mm2 \n\t"
  169. "psllq $32, %%mm3 \n\t"
  170. "pand %4, %%mm2 \n\t"
  171. "pand %5, %%mm3 \n\t"
  172. "por %%mm2, %%mm0 \n\t"
  173. "psrlq $16, %%mm1 \n\t"
  174. "psrlq $32, %%mm4 \n\t"
  175. "psllq $16, %%mm5 \n\t"
  176. "por %%mm3, %%mm1 \n\t"
  177. "pand %6, %%mm5 \n\t"
  178. "por %%mm5, %%mm4 \n\t"
  179. MOVNTQ" %%mm0, %0 \n\t"
  180. MOVNTQ" %%mm1, 8%0 \n\t"
  181. MOVNTQ" %%mm4, 16%0"
  182. :"=m"(*dest)
  183. :"m"(*s),"m"(mask24l),
  184. "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
  185. :"memory");
  186. dest += 24;
  187. s += 32;
  188. }
  189. __asm__ volatile(SFENCE:::"memory");
  190. __asm__ volatile(EMMS:::"memory");
  191. #endif
  192. while (s < end)
  193. {
  194. #ifdef WORDS_BIGENDIAN
  195. /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
  196. s++;
  197. dest[2] = *s++;
  198. dest[1] = *s++;
  199. dest[0] = *s++;
  200. dest += 3;
  201. #else
  202. *dest++ = *s++;
  203. *dest++ = *s++;
  204. *dest++ = *s++;
  205. s++;
  206. #endif
  207. }
  208. }
  209. /*
  210. original by Strepto/Astral
  211. ported to gcc & bugfixed: A'rpi
  212. MMX2, 3DNOW optimization by Nick Kurshev
  213. 32-bit C version, and and&add trick by Michael Niedermayer
  214. */
  215. static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
  216. {
  217. register const uint8_t* s=src;
  218. register uint8_t* d=dst;
  219. register const uint8_t *end;
  220. const uint8_t *mm_end;
  221. end = s + src_size;
  222. #if HAVE_MMX
  223. __asm__ volatile(PREFETCH" %0"::"m"(*s));
  224. __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
  225. mm_end = end - 15;
  226. while (s<mm_end)
  227. {
  228. __asm__ volatile(
  229. PREFETCH" 32%1 \n\t"
  230. "movq %1, %%mm0 \n\t"
  231. "movq 8%1, %%mm2 \n\t"
  232. "movq %%mm0, %%mm1 \n\t"
  233. "movq %%mm2, %%mm3 \n\t"
  234. "pand %%mm4, %%mm0 \n\t"
  235. "pand %%mm4, %%mm2 \n\t"
  236. "paddw %%mm1, %%mm0 \n\t"
  237. "paddw %%mm3, %%mm2 \n\t"
  238. MOVNTQ" %%mm0, %0 \n\t"
  239. MOVNTQ" %%mm2, 8%0"
  240. :"=m"(*d)
  241. :"m"(*s)
  242. );
  243. d+=16;
  244. s+=16;
  245. }
  246. __asm__ volatile(SFENCE:::"memory");
  247. __asm__ volatile(EMMS:::"memory");
  248. #endif
  249. mm_end = end - 3;
  250. while (s < mm_end)
  251. {
  252. register unsigned x= *((const uint32_t *)s);
  253. *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
  254. d+=4;
  255. s+=4;
  256. }
  257. if (s < end)
  258. {
  259. register unsigned short x= *((const uint16_t *)s);
  260. *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
  261. }
  262. }
  263. static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
  264. {
  265. register const uint8_t* s=src;
  266. register uint8_t* d=dst;
  267. register const uint8_t *end;
  268. const uint8_t *mm_end;
  269. end = s + src_size;
  270. #if HAVE_MMX
  271. __asm__ volatile(PREFETCH" %0"::"m"(*s));
  272. __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
  273. __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
  274. mm_end = end - 15;
  275. while (s<mm_end)
  276. {
  277. __asm__ volatile(
  278. PREFETCH" 32%1 \n\t"
  279. "movq %1, %%mm0 \n\t"
  280. "movq 8%1, %%mm2 \n\t"
  281. "movq %%mm0, %%mm1 \n\t"
  282. "movq %%mm2, %%mm3 \n\t"
  283. "psrlq $1, %%mm0 \n\t"
  284. "psrlq $1, %%mm2 \n\t"
  285. "pand %%mm7, %%mm0 \n\t"
  286. "pand %%mm7, %%mm2 \n\t"
  287. "pand %%mm6, %%mm1 \n\t"
  288. "pand %%mm6, %%mm3 \n\t"
  289. "por %%mm1, %%mm0 \n\t"
  290. "por %%mm3, %%mm2 \n\t"
  291. MOVNTQ" %%mm0, %0 \n\t"
  292. MOVNTQ" %%mm2, 8%0"
  293. :"=m"(*d)
  294. :"m"(*s)
  295. );
  296. d+=16;
  297. s+=16;
  298. }
  299. __asm__ volatile(SFENCE:::"memory");
  300. __asm__ volatile(EMMS:::"memory");
  301. #endif
  302. mm_end = end - 3;
  303. while (s < mm_end)
  304. {
  305. register uint32_t x= *((const uint32_t*)s);
  306. *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
  307. s+=4;
  308. d+=4;
  309. }
  310. if (s < end)
  311. {
  312. register uint16_t x= *((const uint16_t*)s);
  313. *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
  314. s+=2;
  315. d+=2;
  316. }
  317. }
  318. static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
  319. {
  320. const uint8_t *s = src;
  321. const uint8_t *end;
  322. #if HAVE_MMX
  323. const uint8_t *mm_end;
  324. #endif
  325. uint16_t *d = (uint16_t *)dst;
  326. end = s + src_size;
  327. #if HAVE_MMX
  328. mm_end = end - 15;
  329. #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
  330. __asm__ volatile(
  331. "movq %3, %%mm5 \n\t"
  332. "movq %4, %%mm6 \n\t"
  333. "movq %5, %%mm7 \n\t"
  334. "jmp 2f \n\t"
  335. ASMALIGN(4)
  336. "1: \n\t"
  337. PREFETCH" 32(%1) \n\t"
  338. "movd (%1), %%mm0 \n\t"
  339. "movd 4(%1), %%mm3 \n\t"
  340. "punpckldq 8(%1), %%mm0 \n\t"
  341. "punpckldq 12(%1), %%mm3 \n\t"
  342. "movq %%mm0, %%mm1 \n\t"
  343. "movq %%mm3, %%mm4 \n\t"
  344. "pand %%mm6, %%mm0 \n\t"
  345. "pand %%mm6, %%mm3 \n\t"
  346. "pmaddwd %%mm7, %%mm0 \n\t"
  347. "pmaddwd %%mm7, %%mm3 \n\t"
  348. "pand %%mm5, %%mm1 \n\t"
  349. "pand %%mm5, %%mm4 \n\t"
  350. "por %%mm1, %%mm0 \n\t"
  351. "por %%mm4, %%mm3 \n\t"
  352. "psrld $5, %%mm0 \n\t"
  353. "pslld $11, %%mm3 \n\t"
  354. "por %%mm3, %%mm0 \n\t"
  355. MOVNTQ" %%mm0, (%0) \n\t"
  356. "add $16, %1 \n\t"
  357. "add $8, %0 \n\t"
  358. "2: \n\t"
  359. "cmp %2, %1 \n\t"
  360. " jb 1b \n\t"
  361. : "+r" (d), "+r"(s)
  362. : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
  363. );
  364. #else
  365. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  366. __asm__ volatile(
  367. "movq %0, %%mm7 \n\t"
  368. "movq %1, %%mm6 \n\t"
  369. ::"m"(red_16mask),"m"(green_16mask));
  370. while (s < mm_end)
  371. {
  372. __asm__ volatile(
  373. PREFETCH" 32%1 \n\t"
  374. "movd %1, %%mm0 \n\t"
  375. "movd 4%1, %%mm3 \n\t"
  376. "punpckldq 8%1, %%mm0 \n\t"
  377. "punpckldq 12%1, %%mm3 \n\t"
  378. "movq %%mm0, %%mm1 \n\t"
  379. "movq %%mm0, %%mm2 \n\t"
  380. "movq %%mm3, %%mm4 \n\t"
  381. "movq %%mm3, %%mm5 \n\t"
  382. "psrlq $3, %%mm0 \n\t"
  383. "psrlq $3, %%mm3 \n\t"
  384. "pand %2, %%mm0 \n\t"
  385. "pand %2, %%mm3 \n\t"
  386. "psrlq $5, %%mm1 \n\t"
  387. "psrlq $5, %%mm4 \n\t"
  388. "pand %%mm6, %%mm1 \n\t"
  389. "pand %%mm6, %%mm4 \n\t"
  390. "psrlq $8, %%mm2 \n\t"
  391. "psrlq $8, %%mm5 \n\t"
  392. "pand %%mm7, %%mm2 \n\t"
  393. "pand %%mm7, %%mm5 \n\t"
  394. "por %%mm1, %%mm0 \n\t"
  395. "por %%mm4, %%mm3 \n\t"
  396. "por %%mm2, %%mm0 \n\t"
  397. "por %%mm5, %%mm3 \n\t"
  398. "psllq $16, %%mm3 \n\t"
  399. "por %%mm3, %%mm0 \n\t"
  400. MOVNTQ" %%mm0, %0 \n\t"
  401. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  402. d += 4;
  403. s += 16;
  404. }
  405. #endif
  406. __asm__ volatile(SFENCE:::"memory");
  407. __asm__ volatile(EMMS:::"memory");
  408. #endif
  409. while (s < end)
  410. {
  411. register int rgb = *(const uint32_t*)s; s += 4;
  412. *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
  413. }
  414. }
  415. static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
  416. {
  417. const uint8_t *s = src;
  418. const uint8_t *end;
  419. #if HAVE_MMX
  420. const uint8_t *mm_end;
  421. #endif
  422. uint16_t *d = (uint16_t *)dst;
  423. end = s + src_size;
  424. #if HAVE_MMX
  425. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  426. __asm__ volatile(
  427. "movq %0, %%mm7 \n\t"
  428. "movq %1, %%mm6 \n\t"
  429. ::"m"(red_16mask),"m"(green_16mask));
  430. mm_end = end - 15;
  431. while (s < mm_end)
  432. {
  433. __asm__ volatile(
  434. PREFETCH" 32%1 \n\t"
  435. "movd %1, %%mm0 \n\t"
  436. "movd 4%1, %%mm3 \n\t"
  437. "punpckldq 8%1, %%mm0 \n\t"
  438. "punpckldq 12%1, %%mm3 \n\t"
  439. "movq %%mm0, %%mm1 \n\t"
  440. "movq %%mm0, %%mm2 \n\t"
  441. "movq %%mm3, %%mm4 \n\t"
  442. "movq %%mm3, %%mm5 \n\t"
  443. "psllq $8, %%mm0 \n\t"
  444. "psllq $8, %%mm3 \n\t"
  445. "pand %%mm7, %%mm0 \n\t"
  446. "pand %%mm7, %%mm3 \n\t"
  447. "psrlq $5, %%mm1 \n\t"
  448. "psrlq $5, %%mm4 \n\t"
  449. "pand %%mm6, %%mm1 \n\t"
  450. "pand %%mm6, %%mm4 \n\t"
  451. "psrlq $19, %%mm2 \n\t"
  452. "psrlq $19, %%mm5 \n\t"
  453. "pand %2, %%mm2 \n\t"
  454. "pand %2, %%mm5 \n\t"
  455. "por %%mm1, %%mm0 \n\t"
  456. "por %%mm4, %%mm3 \n\t"
  457. "por %%mm2, %%mm0 \n\t"
  458. "por %%mm5, %%mm3 \n\t"
  459. "psllq $16, %%mm3 \n\t"
  460. "por %%mm3, %%mm0 \n\t"
  461. MOVNTQ" %%mm0, %0 \n\t"
  462. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  463. d += 4;
  464. s += 16;
  465. }
  466. __asm__ volatile(SFENCE:::"memory");
  467. __asm__ volatile(EMMS:::"memory");
  468. #endif
  469. while (s < end)
  470. {
  471. register int rgb = *(const uint32_t*)s; s += 4;
  472. *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
  473. }
  474. }
  475. static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
  476. {
  477. const uint8_t *s = src;
  478. const uint8_t *end;
  479. #if HAVE_MMX
  480. const uint8_t *mm_end;
  481. #endif
  482. uint16_t *d = (uint16_t *)dst;
  483. end = s + src_size;
  484. #if HAVE_MMX
  485. mm_end = end - 15;
  486. #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
  487. __asm__ volatile(
  488. "movq %3, %%mm5 \n\t"
  489. "movq %4, %%mm6 \n\t"
  490. "movq %5, %%mm7 \n\t"
  491. "jmp 2f \n\t"
  492. ASMALIGN(4)
  493. "1: \n\t"
  494. PREFETCH" 32(%1) \n\t"
  495. "movd (%1), %%mm0 \n\t"
  496. "movd 4(%1), %%mm3 \n\t"
  497. "punpckldq 8(%1), %%mm0 \n\t"
  498. "punpckldq 12(%1), %%mm3 \n\t"
  499. "movq %%mm0, %%mm1 \n\t"
  500. "movq %%mm3, %%mm4 \n\t"
  501. "pand %%mm6, %%mm0 \n\t"
  502. "pand %%mm6, %%mm3 \n\t"
  503. "pmaddwd %%mm7, %%mm0 \n\t"
  504. "pmaddwd %%mm7, %%mm3 \n\t"
  505. "pand %%mm5, %%mm1 \n\t"
  506. "pand %%mm5, %%mm4 \n\t"
  507. "por %%mm1, %%mm0 \n\t"
  508. "por %%mm4, %%mm3 \n\t"
  509. "psrld $6, %%mm0 \n\t"
  510. "pslld $10, %%mm3 \n\t"
  511. "por %%mm3, %%mm0 \n\t"
  512. MOVNTQ" %%mm0, (%0) \n\t"
  513. "add $16, %1 \n\t"
  514. "add $8, %0 \n\t"
  515. "2: \n\t"
  516. "cmp %2, %1 \n\t"
  517. " jb 1b \n\t"
  518. : "+r" (d), "+r"(s)
  519. : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
  520. );
  521. #else
  522. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  523. __asm__ volatile(
  524. "movq %0, %%mm7 \n\t"
  525. "movq %1, %%mm6 \n\t"
  526. ::"m"(red_15mask),"m"(green_15mask));
  527. while (s < mm_end)
  528. {
  529. __asm__ volatile(
  530. PREFETCH" 32%1 \n\t"
  531. "movd %1, %%mm0 \n\t"
  532. "movd 4%1, %%mm3 \n\t"
  533. "punpckldq 8%1, %%mm0 \n\t"
  534. "punpckldq 12%1, %%mm3 \n\t"
  535. "movq %%mm0, %%mm1 \n\t"
  536. "movq %%mm0, %%mm2 \n\t"
  537. "movq %%mm3, %%mm4 \n\t"
  538. "movq %%mm3, %%mm5 \n\t"
  539. "psrlq $3, %%mm0 \n\t"
  540. "psrlq $3, %%mm3 \n\t"
  541. "pand %2, %%mm0 \n\t"
  542. "pand %2, %%mm3 \n\t"
  543. "psrlq $6, %%mm1 \n\t"
  544. "psrlq $6, %%mm4 \n\t"
  545. "pand %%mm6, %%mm1 \n\t"
  546. "pand %%mm6, %%mm4 \n\t"
  547. "psrlq $9, %%mm2 \n\t"
  548. "psrlq $9, %%mm5 \n\t"
  549. "pand %%mm7, %%mm2 \n\t"
  550. "pand %%mm7, %%mm5 \n\t"
  551. "por %%mm1, %%mm0 \n\t"
  552. "por %%mm4, %%mm3 \n\t"
  553. "por %%mm2, %%mm0 \n\t"
  554. "por %%mm5, %%mm3 \n\t"
  555. "psllq $16, %%mm3 \n\t"
  556. "por %%mm3, %%mm0 \n\t"
  557. MOVNTQ" %%mm0, %0 \n\t"
  558. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  559. d += 4;
  560. s += 16;
  561. }
  562. #endif
  563. __asm__ volatile(SFENCE:::"memory");
  564. __asm__ volatile(EMMS:::"memory");
  565. #endif
  566. while (s < end)
  567. {
  568. register int rgb = *(const uint32_t*)s; s += 4;
  569. *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
  570. }
  571. }
  572. static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
  573. {
  574. const uint8_t *s = src;
  575. const uint8_t *end;
  576. #if HAVE_MMX
  577. const uint8_t *mm_end;
  578. #endif
  579. uint16_t *d = (uint16_t *)dst;
  580. end = s + src_size;
  581. #if HAVE_MMX
  582. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  583. __asm__ volatile(
  584. "movq %0, %%mm7 \n\t"
  585. "movq %1, %%mm6 \n\t"
  586. ::"m"(red_15mask),"m"(green_15mask));
  587. mm_end = end - 15;
  588. while (s < mm_end)
  589. {
  590. __asm__ volatile(
  591. PREFETCH" 32%1 \n\t"
  592. "movd %1, %%mm0 \n\t"
  593. "movd 4%1, %%mm3 \n\t"
  594. "punpckldq 8%1, %%mm0 \n\t"
  595. "punpckldq 12%1, %%mm3 \n\t"
  596. "movq %%mm0, %%mm1 \n\t"
  597. "movq %%mm0, %%mm2 \n\t"
  598. "movq %%mm3, %%mm4 \n\t"
  599. "movq %%mm3, %%mm5 \n\t"
  600. "psllq $7, %%mm0 \n\t"
  601. "psllq $7, %%mm3 \n\t"
  602. "pand %%mm7, %%mm0 \n\t"
  603. "pand %%mm7, %%mm3 \n\t"
  604. "psrlq $6, %%mm1 \n\t"
  605. "psrlq $6, %%mm4 \n\t"
  606. "pand %%mm6, %%mm1 \n\t"
  607. "pand %%mm6, %%mm4 \n\t"
  608. "psrlq $19, %%mm2 \n\t"
  609. "psrlq $19, %%mm5 \n\t"
  610. "pand %2, %%mm2 \n\t"
  611. "pand %2, %%mm5 \n\t"
  612. "por %%mm1, %%mm0 \n\t"
  613. "por %%mm4, %%mm3 \n\t"
  614. "por %%mm2, %%mm0 \n\t"
  615. "por %%mm5, %%mm3 \n\t"
  616. "psllq $16, %%mm3 \n\t"
  617. "por %%mm3, %%mm0 \n\t"
  618. MOVNTQ" %%mm0, %0 \n\t"
  619. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  620. d += 4;
  621. s += 16;
  622. }
  623. __asm__ volatile(SFENCE:::"memory");
  624. __asm__ volatile(EMMS:::"memory");
  625. #endif
  626. while (s < end)
  627. {
  628. register int rgb = *(const uint32_t*)s; s += 4;
  629. *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
  630. }
  631. }
  632. static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
  633. {
  634. const uint8_t *s = src;
  635. const uint8_t *end;
  636. #if HAVE_MMX
  637. const uint8_t *mm_end;
  638. #endif
  639. uint16_t *d = (uint16_t *)dst;
  640. end = s + src_size;
  641. #if HAVE_MMX
  642. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  643. __asm__ volatile(
  644. "movq %0, %%mm7 \n\t"
  645. "movq %1, %%mm6 \n\t"
  646. ::"m"(red_16mask),"m"(green_16mask));
  647. mm_end = end - 11;
  648. while (s < mm_end)
  649. {
  650. __asm__ volatile(
  651. PREFETCH" 32%1 \n\t"
  652. "movd %1, %%mm0 \n\t"
  653. "movd 3%1, %%mm3 \n\t"
  654. "punpckldq 6%1, %%mm0 \n\t"
  655. "punpckldq 9%1, %%mm3 \n\t"
  656. "movq %%mm0, %%mm1 \n\t"
  657. "movq %%mm0, %%mm2 \n\t"
  658. "movq %%mm3, %%mm4 \n\t"
  659. "movq %%mm3, %%mm5 \n\t"
  660. "psrlq $3, %%mm0 \n\t"
  661. "psrlq $3, %%mm3 \n\t"
  662. "pand %2, %%mm0 \n\t"
  663. "pand %2, %%mm3 \n\t"
  664. "psrlq $5, %%mm1 \n\t"
  665. "psrlq $5, %%mm4 \n\t"
  666. "pand %%mm6, %%mm1 \n\t"
  667. "pand %%mm6, %%mm4 \n\t"
  668. "psrlq $8, %%mm2 \n\t"
  669. "psrlq $8, %%mm5 \n\t"
  670. "pand %%mm7, %%mm2 \n\t"
  671. "pand %%mm7, %%mm5 \n\t"
  672. "por %%mm1, %%mm0 \n\t"
  673. "por %%mm4, %%mm3 \n\t"
  674. "por %%mm2, %%mm0 \n\t"
  675. "por %%mm5, %%mm3 \n\t"
  676. "psllq $16, %%mm3 \n\t"
  677. "por %%mm3, %%mm0 \n\t"
  678. MOVNTQ" %%mm0, %0 \n\t"
  679. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  680. d += 4;
  681. s += 12;
  682. }
  683. __asm__ volatile(SFENCE:::"memory");
  684. __asm__ volatile(EMMS:::"memory");
  685. #endif
  686. while (s < end)
  687. {
  688. const int b = *s++;
  689. const int g = *s++;
  690. const int r = *s++;
  691. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  692. }
  693. }
  694. static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
  695. {
  696. const uint8_t *s = src;
  697. const uint8_t *end;
  698. #if HAVE_MMX
  699. const uint8_t *mm_end;
  700. #endif
  701. uint16_t *d = (uint16_t *)dst;
  702. end = s + src_size;
  703. #if HAVE_MMX
  704. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  705. __asm__ volatile(
  706. "movq %0, %%mm7 \n\t"
  707. "movq %1, %%mm6 \n\t"
  708. ::"m"(red_16mask),"m"(green_16mask));
  709. mm_end = end - 15;
  710. while (s < mm_end)
  711. {
  712. __asm__ volatile(
  713. PREFETCH" 32%1 \n\t"
  714. "movd %1, %%mm0 \n\t"
  715. "movd 3%1, %%mm3 \n\t"
  716. "punpckldq 6%1, %%mm0 \n\t"
  717. "punpckldq 9%1, %%mm3 \n\t"
  718. "movq %%mm0, %%mm1 \n\t"
  719. "movq %%mm0, %%mm2 \n\t"
  720. "movq %%mm3, %%mm4 \n\t"
  721. "movq %%mm3, %%mm5 \n\t"
  722. "psllq $8, %%mm0 \n\t"
  723. "psllq $8, %%mm3 \n\t"
  724. "pand %%mm7, %%mm0 \n\t"
  725. "pand %%mm7, %%mm3 \n\t"
  726. "psrlq $5, %%mm1 \n\t"
  727. "psrlq $5, %%mm4 \n\t"
  728. "pand %%mm6, %%mm1 \n\t"
  729. "pand %%mm6, %%mm4 \n\t"
  730. "psrlq $19, %%mm2 \n\t"
  731. "psrlq $19, %%mm5 \n\t"
  732. "pand %2, %%mm2 \n\t"
  733. "pand %2, %%mm5 \n\t"
  734. "por %%mm1, %%mm0 \n\t"
  735. "por %%mm4, %%mm3 \n\t"
  736. "por %%mm2, %%mm0 \n\t"
  737. "por %%mm5, %%mm3 \n\t"
  738. "psllq $16, %%mm3 \n\t"
  739. "por %%mm3, %%mm0 \n\t"
  740. MOVNTQ" %%mm0, %0 \n\t"
  741. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  742. d += 4;
  743. s += 12;
  744. }
  745. __asm__ volatile(SFENCE:::"memory");
  746. __asm__ volatile(EMMS:::"memory");
  747. #endif
  748. while (s < end)
  749. {
  750. const int r = *s++;
  751. const int g = *s++;
  752. const int b = *s++;
  753. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  754. }
  755. }
  756. static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
  757. {
  758. const uint8_t *s = src;
  759. const uint8_t *end;
  760. #if HAVE_MMX
  761. const uint8_t *mm_end;
  762. #endif
  763. uint16_t *d = (uint16_t *)dst;
  764. end = s + src_size;
  765. #if HAVE_MMX
  766. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  767. __asm__ volatile(
  768. "movq %0, %%mm7 \n\t"
  769. "movq %1, %%mm6 \n\t"
  770. ::"m"(red_15mask),"m"(green_15mask));
  771. mm_end = end - 11;
  772. while (s < mm_end)
  773. {
  774. __asm__ volatile(
  775. PREFETCH" 32%1 \n\t"
  776. "movd %1, %%mm0 \n\t"
  777. "movd 3%1, %%mm3 \n\t"
  778. "punpckldq 6%1, %%mm0 \n\t"
  779. "punpckldq 9%1, %%mm3 \n\t"
  780. "movq %%mm0, %%mm1 \n\t"
  781. "movq %%mm0, %%mm2 \n\t"
  782. "movq %%mm3, %%mm4 \n\t"
  783. "movq %%mm3, %%mm5 \n\t"
  784. "psrlq $3, %%mm0 \n\t"
  785. "psrlq $3, %%mm3 \n\t"
  786. "pand %2, %%mm0 \n\t"
  787. "pand %2, %%mm3 \n\t"
  788. "psrlq $6, %%mm1 \n\t"
  789. "psrlq $6, %%mm4 \n\t"
  790. "pand %%mm6, %%mm1 \n\t"
  791. "pand %%mm6, %%mm4 \n\t"
  792. "psrlq $9, %%mm2 \n\t"
  793. "psrlq $9, %%mm5 \n\t"
  794. "pand %%mm7, %%mm2 \n\t"
  795. "pand %%mm7, %%mm5 \n\t"
  796. "por %%mm1, %%mm0 \n\t"
  797. "por %%mm4, %%mm3 \n\t"
  798. "por %%mm2, %%mm0 \n\t"
  799. "por %%mm5, %%mm3 \n\t"
  800. "psllq $16, %%mm3 \n\t"
  801. "por %%mm3, %%mm0 \n\t"
  802. MOVNTQ" %%mm0, %0 \n\t"
  803. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  804. d += 4;
  805. s += 12;
  806. }
  807. __asm__ volatile(SFENCE:::"memory");
  808. __asm__ volatile(EMMS:::"memory");
  809. #endif
  810. while (s < end)
  811. {
  812. const int b = *s++;
  813. const int g = *s++;
  814. const int r = *s++;
  815. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  816. }
  817. }
  818. static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
  819. {
  820. const uint8_t *s = src;
  821. const uint8_t *end;
  822. #if HAVE_MMX
  823. const uint8_t *mm_end;
  824. #endif
  825. uint16_t *d = (uint16_t *)dst;
  826. end = s + src_size;
  827. #if HAVE_MMX
  828. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  829. __asm__ volatile(
  830. "movq %0, %%mm7 \n\t"
  831. "movq %1, %%mm6 \n\t"
  832. ::"m"(red_15mask),"m"(green_15mask));
  833. mm_end = end - 15;
  834. while (s < mm_end)
  835. {
  836. __asm__ volatile(
  837. PREFETCH" 32%1 \n\t"
  838. "movd %1, %%mm0 \n\t"
  839. "movd 3%1, %%mm3 \n\t"
  840. "punpckldq 6%1, %%mm0 \n\t"
  841. "punpckldq 9%1, %%mm3 \n\t"
  842. "movq %%mm0, %%mm1 \n\t"
  843. "movq %%mm0, %%mm2 \n\t"
  844. "movq %%mm3, %%mm4 \n\t"
  845. "movq %%mm3, %%mm5 \n\t"
  846. "psllq $7, %%mm0 \n\t"
  847. "psllq $7, %%mm3 \n\t"
  848. "pand %%mm7, %%mm0 \n\t"
  849. "pand %%mm7, %%mm3 \n\t"
  850. "psrlq $6, %%mm1 \n\t"
  851. "psrlq $6, %%mm4 \n\t"
  852. "pand %%mm6, %%mm1 \n\t"
  853. "pand %%mm6, %%mm4 \n\t"
  854. "psrlq $19, %%mm2 \n\t"
  855. "psrlq $19, %%mm5 \n\t"
  856. "pand %2, %%mm2 \n\t"
  857. "pand %2, %%mm5 \n\t"
  858. "por %%mm1, %%mm0 \n\t"
  859. "por %%mm4, %%mm3 \n\t"
  860. "por %%mm2, %%mm0 \n\t"
  861. "por %%mm5, %%mm3 \n\t"
  862. "psllq $16, %%mm3 \n\t"
  863. "por %%mm3, %%mm0 \n\t"
  864. MOVNTQ" %%mm0, %0 \n\t"
  865. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  866. d += 4;
  867. s += 12;
  868. }
  869. __asm__ volatile(SFENCE:::"memory");
  870. __asm__ volatile(EMMS:::"memory");
  871. #endif
  872. while (s < end)
  873. {
  874. const int r = *s++;
  875. const int g = *s++;
  876. const int b = *s++;
  877. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  878. }
  879. }
  /*
    A less accurate approximation is used here: the input value is simply
    left-shifted and the low-order bits are filled with zeroes. This method
    improves PNG compression, but it cannot reproduce white exactly, since it
    does not generate an all-ones maximum value; the net effect is to darken
    the image slightly.

    The better method would be "left bit replication":

     4 3 2 1 0
     ---------
     1 1 0 1 1

     7 6 5 4 3  2 1 0
     ----------------
     1 1 0 1 1  1 1 0
     |=======|  |===|
         |      leftmost bits repeated to fill open bits
         |
     original bits
  */
  898. static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
  899. {
  900. const uint16_t *end;
  901. #if HAVE_MMX
  902. const uint16_t *mm_end;
  903. #endif
  904. uint8_t *d = dst;
  905. const uint16_t *s = (const uint16_t*)src;
  906. end = s + src_size/2;
  907. #if HAVE_MMX
  908. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  909. mm_end = end - 7;
  910. while (s < mm_end)
  911. {
  912. __asm__ volatile(
  913. PREFETCH" 32%1 \n\t"
  914. "movq %1, %%mm0 \n\t"
  915. "movq %1, %%mm1 \n\t"
  916. "movq %1, %%mm2 \n\t"
  917. "pand %2, %%mm0 \n\t"
  918. "pand %3, %%mm1 \n\t"
  919. "pand %4, %%mm2 \n\t"
  920. "psllq $3, %%mm0 \n\t"
  921. "psrlq $2, %%mm1 \n\t"
  922. "psrlq $7, %%mm2 \n\t"
  923. "movq %%mm0, %%mm3 \n\t"
  924. "movq %%mm1, %%mm4 \n\t"
  925. "movq %%mm2, %%mm5 \n\t"
  926. "punpcklwd %5, %%mm0 \n\t"
  927. "punpcklwd %5, %%mm1 \n\t"
  928. "punpcklwd %5, %%mm2 \n\t"
  929. "punpckhwd %5, %%mm3 \n\t"
  930. "punpckhwd %5, %%mm4 \n\t"
  931. "punpckhwd %5, %%mm5 \n\t"
  932. "psllq $8, %%mm1 \n\t"
  933. "psllq $16, %%mm2 \n\t"
  934. "por %%mm1, %%mm0 \n\t"
  935. "por %%mm2, %%mm0 \n\t"
  936. "psllq $8, %%mm4 \n\t"
  937. "psllq $16, %%mm5 \n\t"
  938. "por %%mm4, %%mm3 \n\t"
  939. "por %%mm5, %%mm3 \n\t"
  940. "movq %%mm0, %%mm6 \n\t"
  941. "movq %%mm3, %%mm7 \n\t"
  942. "movq 8%1, %%mm0 \n\t"
  943. "movq 8%1, %%mm1 \n\t"
  944. "movq 8%1, %%mm2 \n\t"
  945. "pand %2, %%mm0 \n\t"
  946. "pand %3, %%mm1 \n\t"
  947. "pand %4, %%mm2 \n\t"
  948. "psllq $3, %%mm0 \n\t"
  949. "psrlq $2, %%mm1 \n\t"
  950. "psrlq $7, %%mm2 \n\t"
  951. "movq %%mm0, %%mm3 \n\t"
  952. "movq %%mm1, %%mm4 \n\t"
  953. "movq %%mm2, %%mm5 \n\t"
  954. "punpcklwd %5, %%mm0 \n\t"
  955. "punpcklwd %5, %%mm1 \n\t"
  956. "punpcklwd %5, %%mm2 \n\t"
  957. "punpckhwd %5, %%mm3 \n\t"
  958. "punpckhwd %5, %%mm4 \n\t"
  959. "punpckhwd %5, %%mm5 \n\t"
  960. "psllq $8, %%mm1 \n\t"
  961. "psllq $16, %%mm2 \n\t"
  962. "por %%mm1, %%mm0 \n\t"
  963. "por %%mm2, %%mm0 \n\t"
  964. "psllq $8, %%mm4 \n\t"
  965. "psllq $16, %%mm5 \n\t"
  966. "por %%mm4, %%mm3 \n\t"
  967. "por %%mm5, %%mm3 \n\t"
  968. :"=m"(*d)
  969. :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
  970. :"memory");
  971. /* borrowed 32 to 24 */
  972. __asm__ volatile(
  973. "movq %%mm0, %%mm4 \n\t"
  974. "movq %%mm3, %%mm5 \n\t"
  975. "movq %%mm6, %%mm0 \n\t"
  976. "movq %%mm7, %%mm1 \n\t"
  977. "movq %%mm4, %%mm6 \n\t"
  978. "movq %%mm5, %%mm7 \n\t"
  979. "movq %%mm0, %%mm2 \n\t"
  980. "movq %%mm1, %%mm3 \n\t"
  981. "psrlq $8, %%mm2 \n\t"
  982. "psrlq $8, %%mm3 \n\t"
  983. "psrlq $8, %%mm6 \n\t"
  984. "psrlq $8, %%mm7 \n\t"
  985. "pand %2, %%mm0 \n\t"
  986. "pand %2, %%mm1 \n\t"
  987. "pand %2, %%mm4 \n\t"
  988. "pand %2, %%mm5 \n\t"
  989. "pand %3, %%mm2 \n\t"
  990. "pand %3, %%mm3 \n\t"
  991. "pand %3, %%mm6 \n\t"
  992. "pand %3, %%mm7 \n\t"
  993. "por %%mm2, %%mm0 \n\t"
  994. "por %%mm3, %%mm1 \n\t"
  995. "por %%mm6, %%mm4 \n\t"
  996. "por %%mm7, %%mm5 \n\t"
  997. "movq %%mm1, %%mm2 \n\t"
  998. "movq %%mm4, %%mm3 \n\t"
  999. "psllq $48, %%mm2 \n\t"
  1000. "psllq $32, %%mm3 \n\t"
  1001. "pand %4, %%mm2 \n\t"
  1002. "pand %5, %%mm3 \n\t"
  1003. "por %%mm2, %%mm0 \n\t"
  1004. "psrlq $16, %%mm1 \n\t"
  1005. "psrlq $32, %%mm4 \n\t"
  1006. "psllq $16, %%mm5 \n\t"
  1007. "por %%mm3, %%mm1 \n\t"
  1008. "pand %6, %%mm5 \n\t"
  1009. "por %%mm5, %%mm4 \n\t"
  1010. MOVNTQ" %%mm0, %0 \n\t"
  1011. MOVNTQ" %%mm1, 8%0 \n\t"
  1012. MOVNTQ" %%mm4, 16%0"
  1013. :"=m"(*d)
  1014. :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
  1015. :"memory");
  1016. d += 24;
  1017. s += 8;
  1018. }
  1019. __asm__ volatile(SFENCE:::"memory");
  1020. __asm__ volatile(EMMS:::"memory");
  1021. #endif
  1022. while (s < end)
  1023. {
  1024. register uint16_t bgr;
  1025. bgr = *s++;
  1026. *d++ = (bgr&0x1F)<<3;
  1027. *d++ = (bgr&0x3E0)>>2;
  1028. *d++ = (bgr&0x7C00)>>7;
  1029. }
  1030. }
  1031. static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
  1032. {
  1033. const uint16_t *end;
  1034. #if HAVE_MMX
  1035. const uint16_t *mm_end;
  1036. #endif
  1037. uint8_t *d = (uint8_t *)dst;
  1038. const uint16_t *s = (const uint16_t *)src;
  1039. end = s + src_size/2;
  1040. #if HAVE_MMX
  1041. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  1042. mm_end = end - 7;
  1043. while (s < mm_end)
  1044. {
  1045. __asm__ volatile(
  1046. PREFETCH" 32%1 \n\t"
  1047. "movq %1, %%mm0 \n\t"
  1048. "movq %1, %%mm1 \n\t"
  1049. "movq %1, %%mm2 \n\t"
  1050. "pand %2, %%mm0 \n\t"
  1051. "pand %3, %%mm1 \n\t"
  1052. "pand %4, %%mm2 \n\t"
  1053. "psllq $3, %%mm0 \n\t"
  1054. "psrlq $3, %%mm1 \n\t"
  1055. "psrlq $8, %%mm2 \n\t"
  1056. "movq %%mm0, %%mm3 \n\t"
  1057. "movq %%mm1, %%mm4 \n\t"
  1058. "movq %%mm2, %%mm5 \n\t"
  1059. "punpcklwd %5, %%mm0 \n\t"
  1060. "punpcklwd %5, %%mm1 \n\t"
  1061. "punpcklwd %5, %%mm2 \n\t"
  1062. "punpckhwd %5, %%mm3 \n\t"
  1063. "punpckhwd %5, %%mm4 \n\t"
  1064. "punpckhwd %5, %%mm5 \n\t"
  1065. "psllq $8, %%mm1 \n\t"
  1066. "psllq $16, %%mm2 \n\t"
  1067. "por %%mm1, %%mm0 \n\t"
  1068. "por %%mm2, %%mm0 \n\t"
  1069. "psllq $8, %%mm4 \n\t"
  1070. "psllq $16, %%mm5 \n\t"
  1071. "por %%mm4, %%mm3 \n\t"
  1072. "por %%mm5, %%mm3 \n\t"
  1073. "movq %%mm0, %%mm6 \n\t"
  1074. "movq %%mm3, %%mm7 \n\t"
  1075. "movq 8%1, %%mm0 \n\t"
  1076. "movq 8%1, %%mm1 \n\t"
  1077. "movq 8%1, %%mm2 \n\t"
  1078. "pand %2, %%mm0 \n\t"
  1079. "pand %3, %%mm1 \n\t"
  1080. "pand %4, %%mm2 \n\t"
  1081. "psllq $3, %%mm0 \n\t"
  1082. "psrlq $3, %%mm1 \n\t"
  1083. "psrlq $8, %%mm2 \n\t"
  1084. "movq %%mm0, %%mm3 \n\t"
  1085. "movq %%mm1, %%mm4 \n\t"
  1086. "movq %%mm2, %%mm5 \n\t"
  1087. "punpcklwd %5, %%mm0 \n\t"
  1088. "punpcklwd %5, %%mm1 \n\t"
  1089. "punpcklwd %5, %%mm2 \n\t"
  1090. "punpckhwd %5, %%mm3 \n\t"
  1091. "punpckhwd %5, %%mm4 \n\t"
  1092. "punpckhwd %5, %%mm5 \n\t"
  1093. "psllq $8, %%mm1 \n\t"
  1094. "psllq $16, %%mm2 \n\t"
  1095. "por %%mm1, %%mm0 \n\t"
  1096. "por %%mm2, %%mm0 \n\t"
  1097. "psllq $8, %%mm4 \n\t"
  1098. "psllq $16, %%mm5 \n\t"
  1099. "por %%mm4, %%mm3 \n\t"
  1100. "por %%mm5, %%mm3 \n\t"
  1101. :"=m"(*d)
  1102. :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
  1103. :"memory");
  1104. /* borrowed 32 to 24 */
  1105. __asm__ volatile(
  1106. "movq %%mm0, %%mm4 \n\t"
  1107. "movq %%mm3, %%mm5 \n\t"
  1108. "movq %%mm6, %%mm0 \n\t"
  1109. "movq %%mm7, %%mm1 \n\t"
  1110. "movq %%mm4, %%mm6 \n\t"
  1111. "movq %%mm5, %%mm7 \n\t"
  1112. "movq %%mm0, %%mm2 \n\t"
  1113. "movq %%mm1, %%mm3 \n\t"
  1114. "psrlq $8, %%mm2 \n\t"
  1115. "psrlq $8, %%mm3 \n\t"
  1116. "psrlq $8, %%mm6 \n\t"
  1117. "psrlq $8, %%mm7 \n\t"
  1118. "pand %2, %%mm0 \n\t"
  1119. "pand %2, %%mm1 \n\t"
  1120. "pand %2, %%mm4 \n\t"
  1121. "pand %2, %%mm5 \n\t"
  1122. "pand %3, %%mm2 \n\t"
  1123. "pand %3, %%mm3 \n\t"
  1124. "pand %3, %%mm6 \n\t"
  1125. "pand %3, %%mm7 \n\t"
  1126. "por %%mm2, %%mm0 \n\t"
  1127. "por %%mm3, %%mm1 \n\t"
  1128. "por %%mm6, %%mm4 \n\t"
  1129. "por %%mm7, %%mm5 \n\t"
  1130. "movq %%mm1, %%mm2 \n\t"
  1131. "movq %%mm4, %%mm3 \n\t"
  1132. "psllq $48, %%mm2 \n\t"
  1133. "psllq $32, %%mm3 \n\t"
  1134. "pand %4, %%mm2 \n\t"
  1135. "pand %5, %%mm3 \n\t"
  1136. "por %%mm2, %%mm0 \n\t"
  1137. "psrlq $16, %%mm1 \n\t"
  1138. "psrlq $32, %%mm4 \n\t"
  1139. "psllq $16, %%mm5 \n\t"
  1140. "por %%mm3, %%mm1 \n\t"
  1141. "pand %6, %%mm5 \n\t"
  1142. "por %%mm5, %%mm4 \n\t"
  1143. MOVNTQ" %%mm0, %0 \n\t"
  1144. MOVNTQ" %%mm1, 8%0 \n\t"
  1145. MOVNTQ" %%mm4, 16%0"
  1146. :"=m"(*d)
  1147. :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
  1148. :"memory");
  1149. d += 24;
  1150. s += 8;
  1151. }
  1152. __asm__ volatile(SFENCE:::"memory");
  1153. __asm__ volatile(EMMS:::"memory");
  1154. #endif
  1155. while (s < end)
  1156. {
  1157. register uint16_t bgr;
  1158. bgr = *s++;
  1159. *d++ = (bgr&0x1F)<<3;
  1160. *d++ = (bgr&0x7E0)>>3;
  1161. *d++ = (bgr&0xF800)>>8;
  1162. }
  1163. }
  /*
   * PACK_RGB32: assembly fragment that packs four pixels held as 16-bit lanes
   * into two quadwords of 4-byte pixels and stores them with MOVNTQ at asm
   * operand %0 and 8 bytes after it. mm3 is used as scratch.
   *
   * Register contents expected on entry:
   * mm0 = 00 B3 00 B2 00 B1 00 B0
   * mm1 = 00 G3 00 G2 00 G1 00 G0
   * mm2 = 00 R3 00 R2 00 R1 00 R0
   * mm6 = FF FF FF FF FF FF FF FF
   * mm7 = 00 00 00 00 00 00 00 00
   *
   * The last line deliberately has no trailing backslash, so the macro cannot
   * swallow whatever line happens to follow it.
   */
  #define PACK_RGB32 \
      "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
      "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
      "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
      "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
      "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
      "movq %%mm0, %%mm3 \n\t" \
      "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
      "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
      MOVNTQ" %%mm0, %0 \n\t" \
      MOVNTQ" %%mm3, 8%0 \n\t"
  1182. static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
  1183. {
  1184. const uint16_t *end;
  1185. #if HAVE_MMX
  1186. const uint16_t *mm_end;
  1187. #endif
  1188. uint8_t *d = dst;
  1189. const uint16_t *s = (const uint16_t *)src;
  1190. end = s + src_size/2;
  1191. #if HAVE_MMX
  1192. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  1193. __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
  1194. __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
  1195. mm_end = end - 3;
  1196. while (s < mm_end)
  1197. {
  1198. __asm__ volatile(
  1199. PREFETCH" 32%1 \n\t"
  1200. "movq %1, %%mm0 \n\t"
  1201. "movq %1, %%mm1 \n\t"
  1202. "movq %1, %%mm2 \n\t"
  1203. "pand %2, %%mm0 \n\t"
  1204. "pand %3, %%mm1 \n\t"
  1205. "pand %4, %%mm2 \n\t"
  1206. "psllq $3, %%mm0 \n\t"
  1207. "psrlq $2, %%mm1 \n\t"
  1208. "psrlq $7, %%mm2 \n\t"
  1209. PACK_RGB32
  1210. :"=m"(*d)
  1211. :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
  1212. :"memory");
  1213. d += 16;
  1214. s += 4;
  1215. }
  1216. __asm__ volatile(SFENCE:::"memory");
  1217. __asm__ volatile(EMMS:::"memory");
  1218. #endif
  1219. while (s < end)
  1220. {
  1221. #if 0 //slightly slower on Athlon
  1222. int bgr= *s++;
  1223. *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
  1224. #else
  1225. register uint16_t bgr;
  1226. bgr = *s++;
  1227. #ifdef WORDS_BIGENDIAN
  1228. *d++ = 255;
  1229. *d++ = (bgr&0x7C00)>>7;
  1230. *d++ = (bgr&0x3E0)>>2;
  1231. *d++ = (bgr&0x1F)<<3;
  1232. #else
  1233. *d++ = (bgr&0x1F)<<3;
  1234. *d++ = (bgr&0x3E0)>>2;
  1235. *d++ = (bgr&0x7C00)>>7;
  1236. *d++ = 255;
  1237. #endif
  1238. #endif
  1239. }
  1240. }
  1241. static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
  1242. {
  1243. const uint16_t *end;
  1244. #if HAVE_MMX
  1245. const uint16_t *mm_end;
  1246. #endif
  1247. uint8_t *d = dst;
  1248. const uint16_t *s = (const uint16_t*)src;
  1249. end = s + src_size/2;
  1250. #if HAVE_MMX
  1251. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  1252. __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
  1253. __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
  1254. mm_end = end - 3;
  1255. while (s < mm_end)
  1256. {
  1257. __asm__ volatile(
  1258. PREFETCH" 32%1 \n\t"
  1259. "movq %1, %%mm0 \n\t"
  1260. "movq %1, %%mm1 \n\t"
  1261. "movq %1, %%mm2 \n\t"
  1262. "pand %2, %%mm0 \n\t"
  1263. "pand %3, %%mm1 \n\t"
  1264. "pand %4, %%mm2 \n\t"
  1265. "psllq $3, %%mm0 \n\t"
  1266. "psrlq $3, %%mm1 \n\t"
  1267. "psrlq $8, %%mm2 \n\t"
  1268. PACK_RGB32
  1269. :"=m"(*d)
  1270. :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
  1271. :"memory");
  1272. d += 16;
  1273. s += 4;
  1274. }
  1275. __asm__ volatile(SFENCE:::"memory");
  1276. __asm__ volatile(EMMS:::"memory");
  1277. #endif
  1278. while (s < end)
  1279. {
  1280. register uint16_t bgr;
  1281. bgr = *s++;
  1282. #ifdef WORDS_BIGENDIAN
  1283. *d++ = 255;
  1284. *d++ = (bgr&0xF800)>>8;
  1285. *d++ = (bgr&0x7E0)>>3;
  1286. *d++ = (bgr&0x1F)<<3;
  1287. #else
  1288. *d++ = (bgr&0x1F)<<3;
  1289. *d++ = (bgr&0x7E0)>>3;
  1290. *d++ = (bgr&0xF800)>>8;
  1291. *d++ = 255;
  1292. #endif
  1293. }
  1294. }
  1295. static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
  1296. {
  1297. x86_reg idx = 15 - src_size;
  1298. const uint8_t *s = src-idx;
  1299. uint8_t *d = dst-idx;
  1300. #if HAVE_MMX
  1301. __asm__ volatile(
  1302. "test %0, %0 \n\t"
  1303. "jns 2f \n\t"
  1304. PREFETCH" (%1, %0) \n\t"
  1305. "movq %3, %%mm7 \n\t"
  1306. "pxor %4, %%mm7 \n\t"
  1307. "movq %%mm7, %%mm6 \n\t"
  1308. "pxor %5, %%mm7 \n\t"
  1309. ASMALIGN(4)
  1310. "1: \n\t"
  1311. PREFETCH" 32(%1, %0) \n\t"
  1312. "movq (%1, %0), %%mm0 \n\t"
  1313. "movq 8(%1, %0), %%mm1 \n\t"
  1314. # if HAVE_MMX2
  1315. "pshufw $177, %%mm0, %%mm3 \n\t"
  1316. "pshufw $177, %%mm1, %%mm5 \n\t"
  1317. "pand %%mm7, %%mm0 \n\t"
  1318. "pand %%mm6, %%mm3 \n\t"
  1319. "pand %%mm7, %%mm1 \n\t"
  1320. "pand %%mm6, %%mm5 \n\t"
  1321. "por %%mm3, %%mm0 \n\t"
  1322. "por %%mm5, %%mm1 \n\t"
  1323. # else
  1324. "movq %%mm0, %%mm2 \n\t"
  1325. "movq %%mm1, %%mm4 \n\t"
  1326. "pand %%mm7, %%mm0 \n\t"
  1327. "pand %%mm6, %%mm2 \n\t"
  1328. "pand %%mm7, %%mm1 \n\t"
  1329. "pand %%mm6, %%mm4 \n\t"
  1330. "movq %%mm2, %%mm3 \n\t"
  1331. "movq %%mm4, %%mm5 \n\t"
  1332. "pslld $16, %%mm2 \n\t"
  1333. "psrld $16, %%mm3 \n\t"
  1334. "pslld $16, %%mm4 \n\t"
  1335. "psrld $16, %%mm5 \n\t"
  1336. "por %%mm2, %%mm0 \n\t"
  1337. "por %%mm4, %%mm1 \n\t"
  1338. "por %%mm3, %%mm0 \n\t"
  1339. "por %%mm5, %%mm1 \n\t"
  1340. # endif
  1341. MOVNTQ" %%mm0, (%2, %0) \n\t"
  1342. MOVNTQ" %%mm1, 8(%2, %0) \n\t"
  1343. "add $16, %0 \n\t"
  1344. "js 1b \n\t"
  1345. SFENCE" \n\t"
  1346. EMMS" \n\t"
  1347. "2: \n\t"
  1348. : "+&r"(idx)
  1349. : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
  1350. : "memory");
  1351. #endif
  1352. for (; idx<15; idx+=4) {
  1353. register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
  1354. v &= 0xff00ff;
  1355. *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
  1356. }
  1357. }
  1358. static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
  1359. {
  1360. unsigned i;
  1361. #if HAVE_MMX
  1362. x86_reg mmx_size= 23 - src_size;
  1363. __asm__ volatile (
  1364. "test %%"REG_a", %%"REG_a" \n\t"
  1365. "jns 2f \n\t"
  1366. "movq "MANGLE(mask24r)", %%mm5 \n\t"
  1367. "movq "MANGLE(mask24g)", %%mm6 \n\t"
  1368. "movq "MANGLE(mask24b)", %%mm7 \n\t"
  1369. ASMALIGN(4)
  1370. "1: \n\t"
  1371. PREFETCH" 32(%1, %%"REG_a") \n\t"
  1372. "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
  1373. "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
  1374. "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
  1375. "psllq $16, %%mm0 \n\t" // 00 BGR BGR
  1376. "pand %%mm5, %%mm0 \n\t"
  1377. "pand %%mm6, %%mm1 \n\t"
  1378. "pand %%mm7, %%mm2 \n\t"
  1379. "por %%mm0, %%mm1 \n\t"
  1380. "por %%mm2, %%mm1 \n\t"
  1381. "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
  1382. MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
  1383. "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
  1384. "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
  1385. "pand %%mm7, %%mm0 \n\t"
  1386. "pand %%mm5, %%mm1 \n\t"
  1387. "pand %%mm6, %%mm2 \n\t"
  1388. "por %%mm0, %%mm1 \n\t"
  1389. "por %%mm2, %%mm1 \n\t"
  1390. "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
  1391. MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
  1392. "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
  1393. "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
  1394. "pand %%mm6, %%mm0 \n\t"
  1395. "pand %%mm7, %%mm1 \n\t"
  1396. "pand %%mm5, %%mm2 \n\t"
  1397. "por %%mm0, %%mm1 \n\t"
  1398. "por %%mm2, %%mm1 \n\t"
  1399. MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
  1400. "add $24, %%"REG_a" \n\t"
  1401. " js 1b \n\t"
  1402. "2: \n\t"
  1403. : "+a" (mmx_size)
  1404. : "r" (src-mmx_size), "r"(dst-mmx_size)
  1405. );
  1406. __asm__ volatile(SFENCE:::"memory");
  1407. __asm__ volatile(EMMS:::"memory");
  1408. if (mmx_size==23) return; //finished, was multiple of 8
  1409. src+= src_size;
  1410. dst+= src_size;
  1411. src_size= 23-mmx_size;
  1412. src-= src_size;
  1413. dst-= src_size;
  1414. #endif
  1415. for (i=0; i<src_size; i+=3)
  1416. {
  1417. register uint8_t x;
  1418. x = src[i + 2];
  1419. dst[i + 1] = src[i + 1];
  1420. dst[i + 2] = src[i + 0];
  1421. dst[i + 0] = x;
  1422. }
  1423. }
  1424. static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1425. long width, long height,
  1426. long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
  1427. {
  1428. long y;
  1429. const x86_reg chromWidth= width>>1;
  1430. for (y=0; y<height; y++)
  1431. {
  1432. #if HAVE_MMX
  1433. //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
  1434. __asm__ volatile(
  1435. "xor %%"REG_a", %%"REG_a" \n\t"
  1436. ASMALIGN(4)
  1437. "1: \n\t"
  1438. PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
  1439. PREFETCH" 32(%2, %%"REG_a") \n\t"
  1440. PREFETCH" 32(%3, %%"REG_a") \n\t"
  1441. "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
  1442. "movq %%mm0, %%mm2 \n\t" // U(0)
  1443. "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
  1444. "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1445. "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
  1446. "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
  1447. "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
  1448. "movq %%mm3, %%mm4 \n\t" // Y(0)
  1449. "movq %%mm5, %%mm6 \n\t" // Y(8)
  1450. "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
  1451. "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
  1452. "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
  1453. "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
  1454. MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
  1455. MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
  1456. MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
  1457. MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
  1458. "add $8, %%"REG_a" \n\t"
  1459. "cmp %4, %%"REG_a" \n\t"
  1460. " jb 1b \n\t"
  1461. ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
  1462. : "%"REG_a
  1463. );
  1464. #else
  1465. #if ARCH_ALPHA && HAVE_MVI
  1466. #define pl2yuy2(n) \
  1467. y1 = yc[n]; \
  1468. y2 = yc2[n]; \
  1469. u = uc[n]; \
  1470. v = vc[n]; \
  1471. __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
  1472. __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
  1473. __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
  1474. __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
  1475. yuv1 = (u << 8) + (v << 24); \
  1476. yuv2 = yuv1 + y2; \
  1477. yuv1 += y1; \
  1478. qdst[n] = yuv1; \
  1479. qdst2[n] = yuv2;
  1480. int i;
  1481. uint64_t *qdst = (uint64_t *) dst;
  1482. uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
  1483. const uint32_t *yc = (uint32_t *) ysrc;
  1484. const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
  1485. const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
  1486. for (i = 0; i < chromWidth; i += 8){
  1487. uint64_t y1, y2, yuv1, yuv2;
  1488. uint64_t u, v;
  1489. /* Prefetch */
  1490. __asm__("ldq $31,64(%0)" :: "r"(yc));
  1491. __asm__("ldq $31,64(%0)" :: "r"(yc2));
  1492. __asm__("ldq $31,64(%0)" :: "r"(uc));
  1493. __asm__("ldq $31,64(%0)" :: "r"(vc));
  1494. pl2yuy2(0);
  1495. pl2yuy2(1);
  1496. pl2yuy2(2);
  1497. pl2yuy2(3);
  1498. yc += 4;
  1499. yc2 += 4;
  1500. uc += 4;
  1501. vc += 4;
  1502. qdst += 4;
  1503. qdst2 += 4;
  1504. }
  1505. y++;
  1506. ysrc += lumStride;
  1507. dst += dstStride;
  1508. #elif HAVE_FAST_64BIT
  1509. int i;
  1510. uint64_t *ldst = (uint64_t *) dst;
  1511. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1512. for (i = 0; i < chromWidth; i += 2){
  1513. uint64_t k, l;
  1514. k = yc[0] + (uc[0] << 8) +
  1515. (yc[1] << 16) + (vc[0] << 24);
  1516. l = yc[2] + (uc[1] << 8) +
  1517. (yc[3] << 16) + (vc[1] << 24);
  1518. *ldst++ = k + (l << 32);
  1519. yc += 4;
  1520. uc += 2;
  1521. vc += 2;
  1522. }
  1523. #else
  1524. int i, *idst = (int32_t *) dst;
  1525. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1526. for (i = 0; i < chromWidth; i++){
  1527. #ifdef WORDS_BIGENDIAN
  1528. *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
  1529. (yc[1] << 8) + (vc[0] << 0);
  1530. #else
  1531. *idst++ = yc[0] + (uc[0] << 8) +
  1532. (yc[1] << 16) + (vc[0] << 24);
  1533. #endif
  1534. yc += 2;
  1535. uc++;
  1536. vc++;
  1537. }
  1538. #endif
  1539. #endif
  1540. if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
  1541. {
  1542. usrc += chromStride;
  1543. vsrc += chromStride;
  1544. }
  1545. ysrc += lumStride;
  1546. dst += dstStride;
  1547. }
  1548. #if HAVE_MMX
  1549. __asm__( EMMS" \n\t"
  1550. SFENCE" \n\t"
  1551. :::"memory");
  1552. #endif
  1553. }
  1554. /**
  1555. * Height should be a multiple of 2 and width should be a multiple of 16.
  1556. * (If this is a problem for anyone then tell me, and I will fix it.)
  1557. */
  1558. static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1559. long width, long height,
  1560. long lumStride, long chromStride, long dstStride)
  1561. {
  1562. //FIXME interpolate chroma
  1563. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1564. }
  1565. static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1566. long width, long height,
  1567. long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
  1568. {
  1569. long y;
  1570. const x86_reg chromWidth= width>>1;
  1571. for (y=0; y<height; y++)
  1572. {
  1573. #if HAVE_MMX
  1574. //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
  1575. __asm__ volatile(
  1576. "xor %%"REG_a", %%"REG_a" \n\t"
  1577. ASMALIGN(4)
  1578. "1: \n\t"
  1579. PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
  1580. PREFETCH" 32(%2, %%"REG_a") \n\t"
  1581. PREFETCH" 32(%3, %%"REG_a") \n\t"
  1582. "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
  1583. "movq %%mm0, %%mm2 \n\t" // U(0)
  1584. "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
  1585. "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1586. "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
  1587. "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
  1588. "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
  1589. "movq %%mm0, %%mm4 \n\t" // Y(0)
  1590. "movq %%mm2, %%mm6 \n\t" // Y(8)
  1591. "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
  1592. "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
  1593. "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
  1594. "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
  1595. MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
  1596. MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
  1597. MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
  1598. MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
  1599. "add $8, %%"REG_a" \n\t"
  1600. "cmp %4, %%"REG_a" \n\t"
  1601. " jb 1b \n\t"
  1602. ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
  1603. : "%"REG_a
  1604. );
  1605. #else
  1606. //FIXME adapt the Alpha ASM code from yv12->yuy2
  1607. #if HAVE_FAST_64BIT
  1608. int i;
  1609. uint64_t *ldst = (uint64_t *) dst;
  1610. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1611. for (i = 0; i < chromWidth; i += 2){
  1612. uint64_t k, l;
  1613. k = uc[0] + (yc[0] << 8) +
  1614. (vc[0] << 16) + (yc[1] << 24);
  1615. l = uc[1] + (yc[2] << 8) +
  1616. (vc[1] << 16) + (yc[3] << 24);
  1617. *ldst++ = k + (l << 32);
  1618. yc += 4;
  1619. uc += 2;
  1620. vc += 2;
  1621. }
  1622. #else
  1623. int i, *idst = (int32_t *) dst;
  1624. const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
  1625. for (i = 0; i < chromWidth; i++){
  1626. #ifdef WORDS_BIGENDIAN
  1627. *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
  1628. (vc[0] << 8) + (yc[1] << 0);
  1629. #else
  1630. *idst++ = uc[0] + (yc[0] << 8) +
  1631. (vc[0] << 16) + (yc[1] << 24);
  1632. #endif
  1633. yc += 2;
  1634. uc++;
  1635. vc++;
  1636. }
  1637. #endif
  1638. #endif
  1639. if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
  1640. {
  1641. usrc += chromStride;
  1642. vsrc += chromStride;
  1643. }
  1644. ysrc += lumStride;
  1645. dst += dstStride;
  1646. }
  1647. #if HAVE_MMX
  1648. __asm__( EMMS" \n\t"
  1649. SFENCE" \n\t"
  1650. :::"memory");
  1651. #endif
  1652. }
  1653. /**
  1654. * Height should be a multiple of 2 and width should be a multiple of 16
  1655. * (If this is a problem for anyone then tell me, and I will fix it.)
  1656. */
  1657. static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1658. long width, long height,
  1659. long lumStride, long chromStride, long dstStride)
  1660. {
  1661. //FIXME interpolate chroma
  1662. RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1663. }
  1664. /**
  1665. * Width should be a multiple of 16.
  1666. */
  1667. static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1668. long width, long height,
  1669. long lumStride, long chromStride, long dstStride)
  1670. {
  1671. RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1672. }
  1673. /**
  1674. * Width should be a multiple of 16.
  1675. */
  1676. static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1677. long width, long height,
  1678. long lumStride, long chromStride, long dstStride)
  1679. {
  1680. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1681. }
  1682. /**
  1683. * Height should be a multiple of 2 and width should be a multiple of 16.
  1684. * (If this is a problem for anyone then tell me, and I will fix it.)
  1685. */
  1686. static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1687. long width, long height,
  1688. long lumStride, long chromStride, long srcStride)
  1689. {
  1690. long y;
  1691. const x86_reg chromWidth= width>>1;
  1692. for (y=0; y<height; y+=2)
  1693. {
  1694. #if HAVE_MMX
  1695. __asm__ volatile(
  1696. "xor %%"REG_a", %%"REG_a" \n\t"
  1697. "pcmpeqw %%mm7, %%mm7 \n\t"
  1698. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1699. ASMALIGN(4)
  1700. "1: \n\t"
  1701. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1702. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1703. "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
  1704. "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
  1705. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
  1706. "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
  1707. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
  1708. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1709. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1710. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1711. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1712. MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
  1713. "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
  1714. "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
  1715. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
  1716. "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
  1717. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
  1718. "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
  1719. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1720. "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1721. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1722. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1723. MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
  1724. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1725. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1726. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1727. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1728. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1729. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1730. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1731. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1732. MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
  1733. MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
  1734. "add $8, %%"REG_a" \n\t"
  1735. "cmp %4, %%"REG_a" \n\t"
  1736. " jb 1b \n\t"
  1737. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1738. : "memory", "%"REG_a
  1739. );
  1740. ydst += lumStride;
  1741. src += srcStride;
  1742. __asm__ volatile(
  1743. "xor %%"REG_a", %%"REG_a" \n\t"
  1744. ASMALIGN(4)
  1745. "1: \n\t"
  1746. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1747. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1748. "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
  1749. "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
  1750. "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
  1751. "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1752. "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1753. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1754. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1755. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1756. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1757. MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
  1758. MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
  1759. "add $8, %%"REG_a" \n\t"
  1760. "cmp %4, %%"REG_a" \n\t"
  1761. " jb 1b \n\t"
  1762. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1763. : "memory", "%"REG_a
  1764. );
  1765. #else
  1766. long i;
  1767. for (i=0; i<chromWidth; i++)
  1768. {
  1769. ydst[2*i+0] = src[4*i+0];
  1770. udst[i] = src[4*i+1];
  1771. ydst[2*i+1] = src[4*i+2];
  1772. vdst[i] = src[4*i+3];
  1773. }
  1774. ydst += lumStride;
  1775. src += srcStride;
  1776. for (i=0; i<chromWidth; i++)
  1777. {
  1778. ydst[2*i+0] = src[4*i+0];
  1779. ydst[2*i+1] = src[4*i+2];
  1780. }
  1781. #endif
  1782. udst += chromStride;
  1783. vdst += chromStride;
  1784. ydst += lumStride;
  1785. src += srcStride;
  1786. }
  1787. #if HAVE_MMX
  1788. __asm__ volatile( EMMS" \n\t"
  1789. SFENCE" \n\t"
  1790. :::"memory");
  1791. #endif
  1792. }
  1793. static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
  1794. uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1795. long width, long height, long lumStride, long chromStride)
  1796. {
  1797. /* Y Plane */
  1798. memcpy(ydst, ysrc, width*height);
  1799. /* XXX: implement upscaling for U,V */
  1800. }
  1801. static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
  1802. {
  1803. long x,y;
  1804. dst[0]= src[0];
  1805. // first line
  1806. for (x=0; x<srcWidth-1; x++){
  1807. dst[2*x+1]= (3*src[x] + src[x+1])>>2;
  1808. dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
  1809. }
  1810. dst[2*srcWidth-1]= src[srcWidth-1];
  1811. dst+= dstStride;
  1812. for (y=1; y<srcHeight; y++){
  1813. #if HAVE_MMX2 || HAVE_AMD3DNOW
  1814. const x86_reg mmxSize= srcWidth&~15;
  1815. __asm__ volatile(
  1816. "mov %4, %%"REG_a" \n\t"
  1817. "1: \n\t"
  1818. "movq (%0, %%"REG_a"), %%mm0 \n\t"
  1819. "movq (%1, %%"REG_a"), %%mm1 \n\t"
  1820. "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
  1821. "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
  1822. "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
  1823. "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
  1824. PAVGB" %%mm0, %%mm5 \n\t"
  1825. PAVGB" %%mm0, %%mm3 \n\t"
  1826. PAVGB" %%mm0, %%mm5 \n\t"
  1827. PAVGB" %%mm0, %%mm3 \n\t"
  1828. PAVGB" %%mm1, %%mm4 \n\t"
  1829. PAVGB" %%mm1, %%mm2 \n\t"
  1830. PAVGB" %%mm1, %%mm4 \n\t"
  1831. PAVGB" %%mm1, %%mm2 \n\t"
  1832. "movq %%mm5, %%mm7 \n\t"
  1833. "movq %%mm4, %%mm6 \n\t"
  1834. "punpcklbw %%mm3, %%mm5 \n\t"
  1835. "punpckhbw %%mm3, %%mm7 \n\t"
  1836. "punpcklbw %%mm2, %%mm4 \n\t"
  1837. "punpckhbw %%mm2, %%mm6 \n\t"
  1838. #if 1
  1839. MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
  1840. MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
  1841. MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
  1842. MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
  1843. #else
  1844. "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
  1845. "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
  1846. "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
  1847. "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
  1848. #endif
  1849. "add $8, %%"REG_a" \n\t"
  1850. " js 1b \n\t"
  1851. :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
  1852. "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
  1853. "g" (-mmxSize)
  1854. : "%"REG_a
  1855. );
  1856. #else
  1857. const x86_reg mmxSize=1;
  1858. #endif
  1859. dst[0 ]= (3*src[0] + src[srcStride])>>2;
  1860. dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
  1861. for (x=mmxSize-1; x<srcWidth-1; x++){
  1862. dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
  1863. dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
  1864. dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
  1865. dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
  1866. }
  1867. dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
  1868. dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
  1869. dst+=dstStride*2;
  1870. src+=srcStride;
  1871. }
  1872. // last line
  1873. #if 1
  1874. dst[0]= src[0];
  1875. for (x=0; x<srcWidth-1; x++){
  1876. dst[2*x+1]= (3*src[x] + src[x+1])>>2;
  1877. dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
  1878. }
  1879. dst[2*srcWidth-1]= src[srcWidth-1];
  1880. #else
  1881. for (x=0; x<srcWidth; x++){
  1882. dst[2*x+0]=
  1883. dst[2*x+1]= src[x];
  1884. }
  1885. #endif
  1886. #if HAVE_MMX
  1887. __asm__ volatile( EMMS" \n\t"
  1888. SFENCE" \n\t"
  1889. :::"memory");
  1890. #endif
  1891. }
  1892. /**
  1893. * Height should be a multiple of 2 and width should be a multiple of 16.
  1894. * (If this is a problem for anyone then tell me, and I will fix it.)
  1895. * Chrominance data is only taken from every second line, others are ignored.
  1896. * FIXME: Write HQ version.
  1897. */
  1898. static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1899. long width, long height,
  1900. long lumStride, long chromStride, long srcStride)
  1901. {
  1902. long y;
  1903. const x86_reg chromWidth= width>>1;
  1904. for (y=0; y<height; y+=2)
  1905. {
  1906. #if HAVE_MMX
  1907. __asm__ volatile(
  1908. "xor %%"REG_a", %%"REG_a" \n\t"
  1909. "pcmpeqw %%mm7, %%mm7 \n\t"
  1910. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1911. ASMALIGN(4)
  1912. "1: \n\t"
  1913. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1914. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
  1915. "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
  1916. "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
  1917. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
  1918. "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
  1919. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
  1920. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1921. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1922. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1923. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1924. MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
  1925. "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
  1926. "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
  1927. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
  1928. "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
  1929. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
  1930. "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
  1931. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1932. "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1933. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1934. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1935. MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
  1936. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1937. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1938. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1939. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1940. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1941. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1942. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1943. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1944. MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
  1945. MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
  1946. "add $8, %%"REG_a" \n\t"
  1947. "cmp %4, %%"REG_a" \n\t"
  1948. " jb 1b \n\t"
  1949. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1950. : "memory", "%"REG_a
  1951. );
  1952. ydst += lumStride;
  1953. src += srcStride;
  1954. __asm__ volatile(
  1955. "xor %%"REG_a", %%"REG_a" \n\t"
  1956. ASMALIGN(4)
  1957. "1: \n\t"
  1958. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1959. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1960. "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
  1961. "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
  1962. "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
  1963. "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1964. "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1965. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1966. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1967. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1968. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1969. MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
  1970. MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
  1971. "add $8, %%"REG_a" \n\t"
  1972. "cmp %4, %%"REG_a" \n\t"
  1973. " jb 1b \n\t"
  1974. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1975. : "memory", "%"REG_a
  1976. );
  1977. #else
  1978. long i;
  1979. for (i=0; i<chromWidth; i++)
  1980. {
  1981. udst[i] = src[4*i+0];
  1982. ydst[2*i+0] = src[4*i+1];
  1983. vdst[i] = src[4*i+2];
  1984. ydst[2*i+1] = src[4*i+3];
  1985. }
  1986. ydst += lumStride;
  1987. src += srcStride;
  1988. for (i=0; i<chromWidth; i++)
  1989. {
  1990. ydst[2*i+0] = src[4*i+1];
  1991. ydst[2*i+1] = src[4*i+3];
  1992. }
  1993. #endif
  1994. udst += chromStride;
  1995. vdst += chromStride;
  1996. ydst += lumStride;
  1997. src += srcStride;
  1998. }
  1999. #if HAVE_MMX
  2000. __asm__ volatile( EMMS" \n\t"
  2001. SFENCE" \n\t"
  2002. :::"memory");
  2003. #endif
  2004. }
  2005. /**
   * Converts packed 3-byte pixels to planar Y, U and V.
   * The scalar path reads each pixel as the bytes b, g, r (in that order).
   * One luma byte is written per pixel; one U and one V byte are written per
   * horizontal pixel pair, on one chroma row per two source rows.
   *
   * With HAVE_MMX every row pair except the last one is handled by inline
   * assembly, which combines both rows of the pair (and horizontally adjacent
   * pixels) before computing chroma. The remaining row pair, or all rows when
   * MMX is not compiled in, goes through the scalar loop, which takes chroma from
   * the first pixel of each horizontal pair on the first row of the pair only.
   *
   * Height should be a multiple of 2 and width should be a multiple of 2.
   * (If this is a problem for anyone then tell me, and I will fix it.)
   * Chrominance data is only taken from every second line,
   * others are ignored in the C version.
   * FIXME: Write HQ version.
   *
   * @param src         packed source pixels
   * @param ydst        luma destination plane
   * @param udst        U destination plane
   * @param vdst        V destination plane
   * @param width       pixels per row
   * @param height      number of rows
   * @param lumStride   byte distance between luma rows
   * @param chromStride byte distance between chroma rows
   * @param srcStride   byte distance between source rows
   */
  2012. static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  2013. long width, long height,
  2014. long lumStride, long chromStride, long srcStride)
  2015. {
  2016. long y;
  2017. const x86_reg chromWidth= width>>1;
  2018. #if HAVE_MMX
  /* SIMD path: stops two rows early; the scalar loop below finishes from the current y. */
  2019. for (y=0; y<height-2; y+=2)
  2020. {
  2021. long i;
  /* Luma for the two rows of this pair, one row per pass. */
  2022. for (i=0; i<2; i++)
  2023. {
  /*
   * The pointer operands address the END of the row; REG_a starts at -width and
   * climbs by 8 per pass until it is no longer negative (js), REG_d tracks the
   * matching byte offset into the 3-byte-per-pixel source (3*REG_a, +24 per pass).
   * Eight luma bytes are stored per pass.
   */
  2024. __asm__ volatile(
  2025. "mov %2, %%"REG_a" \n\t"
  2026. "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
  2027. "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
  2028. "pxor %%mm7, %%mm7 \n\t"
  2029. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
  2030. ASMALIGN(4)
  2031. "1: \n\t"
  2032. PREFETCH" 64(%0, %%"REG_d") \n\t"
  2033. "movd (%0, %%"REG_d"), %%mm0 \n\t"
  2034. "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
  2035. "punpcklbw %%mm7, %%mm0 \n\t"
  2036. "punpcklbw %%mm7, %%mm1 \n\t"
  2037. "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
  2038. "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
  2039. "punpcklbw %%mm7, %%mm2 \n\t"
  2040. "punpcklbw %%mm7, %%mm3 \n\t"
  2041. "pmaddwd %%mm6, %%mm0 \n\t"
  2042. "pmaddwd %%mm6, %%mm1 \n\t"
  2043. "pmaddwd %%mm6, %%mm2 \n\t"
  2044. "pmaddwd %%mm6, %%mm3 \n\t"
  2045. #ifndef FAST_BGR2YV12
  2046. "psrad $8, %%mm0 \n\t"
  2047. "psrad $8, %%mm1 \n\t"
  2048. "psrad $8, %%mm2 \n\t"
  2049. "psrad $8, %%mm3 \n\t"
  2050. #endif
  2051. "packssdw %%mm1, %%mm0 \n\t"
  2052. "packssdw %%mm3, %%mm2 \n\t"
  2053. "pmaddwd %%mm5, %%mm0 \n\t"
  2054. "pmaddwd %%mm5, %%mm2 \n\t"
  2055. "packssdw %%mm2, %%mm0 \n\t"
  2056. "psraw $7, %%mm0 \n\t"
  2057. "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
  2058. "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
  2059. "punpcklbw %%mm7, %%mm4 \n\t"
  2060. "punpcklbw %%mm7, %%mm1 \n\t"
  2061. "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
  2062. "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
  2063. "punpcklbw %%mm7, %%mm2 \n\t"
  2064. "punpcklbw %%mm7, %%mm3 \n\t"
  2065. "pmaddwd %%mm6, %%mm4 \n\t"
  2066. "pmaddwd %%mm6, %%mm1 \n\t"
  2067. "pmaddwd %%mm6, %%mm2 \n\t"
  2068. "pmaddwd %%mm6, %%mm3 \n\t"
  2069. #ifndef FAST_BGR2YV12
  2070. "psrad $8, %%mm4 \n\t"
  2071. "psrad $8, %%mm1 \n\t"
  2072. "psrad $8, %%mm2 \n\t"
  2073. "psrad $8, %%mm3 \n\t"
  2074. #endif
  2075. "packssdw %%mm1, %%mm4 \n\t"
  2076. "packssdw %%mm3, %%mm2 \n\t"
  2077. "pmaddwd %%mm5, %%mm4 \n\t"
  2078. "pmaddwd %%mm5, %%mm2 \n\t"
  2079. "add $24, %%"REG_d" \n\t"
  2080. "packssdw %%mm2, %%mm4 \n\t"
  2081. "psraw $7, %%mm4 \n\t"
  2082. "packuswb %%mm4, %%mm0 \n\t"
  2083. "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
  2084. MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
  2085. "add $8, %%"REG_a" \n\t"
  2086. " js 1b \n\t"
  2087. : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
  2088. : "%"REG_a, "%"REG_d
  2089. );
  2090. ydst += lumStride;
  2091. src += srcStride;
  2092. }
  /* Rewind to the first row of the pair: the chroma pass reads both rows again. */
  2093. src -= srcStride*2;
  /*
   * Chroma pass. %0 and %1 address the ends of the two rows of the pair, %2/%3 the
   * ends of the U and V rows. REG_a starts at -chromWidth and climbs by 4 per pass
   * (four U and four V bytes stored); REG_d is the source byte offset (6*REG_a,
   * +24 per pass). The MMX2/3DNow variant combines samples with PAVGB, the plain
   * MMX variant sums four samples and shifts right by 2.
   */
  2094. __asm__ volatile(
  2095. "mov %4, %%"REG_a" \n\t"
  2096. "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
  2097. "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
  2098. "pxor %%mm7, %%mm7 \n\t"
  2099. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
  2100. "add %%"REG_d", %%"REG_d" \n\t"
  2101. ASMALIGN(4)
  2102. "1: \n\t"
  2103. PREFETCH" 64(%0, %%"REG_d") \n\t"
  2104. PREFETCH" 64(%1, %%"REG_d") \n\t"
  2105. #if HAVE_MMX2 || HAVE_AMD3DNOW
  2106. "movq (%0, %%"REG_d"), %%mm0 \n\t"
  2107. "movq (%1, %%"REG_d"), %%mm1 \n\t"
  2108. "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
  2109. "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
  2110. PAVGB" %%mm1, %%mm0 \n\t"
  2111. PAVGB" %%mm3, %%mm2 \n\t"
  2112. "movq %%mm0, %%mm1 \n\t"
  2113. "movq %%mm2, %%mm3 \n\t"
  2114. "psrlq $24, %%mm0 \n\t"
  2115. "psrlq $24, %%mm2 \n\t"
  2116. PAVGB" %%mm1, %%mm0 \n\t"
  2117. PAVGB" %%mm3, %%mm2 \n\t"
  2118. "punpcklbw %%mm7, %%mm0 \n\t"
  2119. "punpcklbw %%mm7, %%mm2 \n\t"
  2120. #else
  2121. "movd (%0, %%"REG_d"), %%mm0 \n\t"
  2122. "movd (%1, %%"REG_d"), %%mm1 \n\t"
  2123. "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
  2124. "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
  2125. "punpcklbw %%mm7, %%mm0 \n\t"
  2126. "punpcklbw %%mm7, %%mm1 \n\t"
  2127. "punpcklbw %%mm7, %%mm2 \n\t"
  2128. "punpcklbw %%mm7, %%mm3 \n\t"
  2129. "paddw %%mm1, %%mm0 \n\t"
  2130. "paddw %%mm3, %%mm2 \n\t"
  2131. "paddw %%mm2, %%mm0 \n\t"
  2132. "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
  2133. "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
  2134. "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
  2135. "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
  2136. "punpcklbw %%mm7, %%mm4 \n\t"
  2137. "punpcklbw %%mm7, %%mm1 \n\t"
  2138. "punpcklbw %%mm7, %%mm2 \n\t"
  2139. "punpcklbw %%mm7, %%mm3 \n\t"
  2140. "paddw %%mm1, %%mm4 \n\t"
  2141. "paddw %%mm3, %%mm2 \n\t"
  2142. "paddw %%mm4, %%mm2 \n\t"
  2143. "psrlw $2, %%mm0 \n\t"
  2144. "psrlw $2, %%mm2 \n\t"
  2145. #endif
  2146. "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
  2147. "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
  2148. "pmaddwd %%mm0, %%mm1 \n\t"
  2149. "pmaddwd %%mm2, %%mm3 \n\t"
  2150. "pmaddwd %%mm6, %%mm0 \n\t"
  2151. "pmaddwd %%mm6, %%mm2 \n\t"
  2152. #ifndef FAST_BGR2YV12
  2153. "psrad $8, %%mm0 \n\t"
  2154. "psrad $8, %%mm1 \n\t"
  2155. "psrad $8, %%mm2 \n\t"
  2156. "psrad $8, %%mm3 \n\t"
  2157. #endif
  2158. "packssdw %%mm2, %%mm0 \n\t"
  2159. "packssdw %%mm3, %%mm1 \n\t"
  2160. "pmaddwd %%mm5, %%mm0 \n\t"
  2161. "pmaddwd %%mm5, %%mm1 \n\t"
  2162. "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
  2163. "psraw $7, %%mm0 \n\t"
  2164. #if HAVE_MMX2 || HAVE_AMD3DNOW
  2165. "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
  2166. "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
  2167. "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
  2168. "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
  2169. PAVGB" %%mm1, %%mm4 \n\t"
  2170. PAVGB" %%mm3, %%mm2 \n\t"
  2171. "movq %%mm4, %%mm1 \n\t"
  2172. "movq %%mm2, %%mm3 \n\t"
  2173. "psrlq $24, %%mm4 \n\t"
  2174. "psrlq $24, %%mm2 \n\t"
  2175. PAVGB" %%mm1, %%mm4 \n\t"
  2176. PAVGB" %%mm3, %%mm2 \n\t"
  2177. "punpcklbw %%mm7, %%mm4 \n\t"
  2178. "punpcklbw %%mm7, %%mm2 \n\t"
  2179. #else
  2180. "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
  2181. "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
  2182. "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
  2183. "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
  2184. "punpcklbw %%mm7, %%mm4 \n\t"
  2185. "punpcklbw %%mm7, %%mm1 \n\t"
  2186. "punpcklbw %%mm7, %%mm2 \n\t"
  2187. "punpcklbw %%mm7, %%mm3 \n\t"
  2188. "paddw %%mm1, %%mm4 \n\t"
  2189. "paddw %%mm3, %%mm2 \n\t"
  2190. "paddw %%mm2, %%mm4 \n\t"
  2191. "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
  2192. "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
  2193. "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
  2194. "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
  2195. "punpcklbw %%mm7, %%mm5 \n\t"
  2196. "punpcklbw %%mm7, %%mm1 \n\t"
  2197. "punpcklbw %%mm7, %%mm2 \n\t"
  2198. "punpcklbw %%mm7, %%mm3 \n\t"
  2199. "paddw %%mm1, %%mm5 \n\t"
  2200. "paddw %%mm3, %%mm2 \n\t"
  2201. "paddw %%mm5, %%mm2 \n\t"
  2202. "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
  2203. "psrlw $2, %%mm4 \n\t"
  2204. "psrlw $2, %%mm2 \n\t"
  2205. #endif
  2206. "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
  2207. "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
  2208. "pmaddwd %%mm4, %%mm1 \n\t"
  2209. "pmaddwd %%mm2, %%mm3 \n\t"
  2210. "pmaddwd %%mm6, %%mm4 \n\t"
  2211. "pmaddwd %%mm6, %%mm2 \n\t"
  2212. #ifndef FAST_BGR2YV12
  2213. "psrad $8, %%mm4 \n\t"
  2214. "psrad $8, %%mm1 \n\t"
  2215. "psrad $8, %%mm2 \n\t"
  2216. "psrad $8, %%mm3 \n\t"
  2217. #endif
  2218. "packssdw %%mm2, %%mm4 \n\t"
  2219. "packssdw %%mm3, %%mm1 \n\t"
  2220. "pmaddwd %%mm5, %%mm4 \n\t"
  2221. "pmaddwd %%mm5, %%mm1 \n\t"
  2222. "add $24, %%"REG_d" \n\t"
  2223. "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
  2224. "psraw $7, %%mm4 \n\t"
  2225. "movq %%mm0, %%mm1 \n\t"
  2226. "punpckldq %%mm4, %%mm0 \n\t"
  2227. "punpckhdq %%mm4, %%mm1 \n\t"
  2228. "packsswb %%mm1, %%mm0 \n\t"
  2229. "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
  2230. "movd %%mm0, (%2, %%"REG_a") \n\t"
  2231. "punpckhdq %%mm0, %%mm0 \n\t"
  2232. "movd %%mm0, (%3, %%"REG_a") \n\t"
  2233. "add $4, %%"REG_a" \n\t"
  2234. " js 1b \n\t"
  2235. : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
  2236. : "%"REG_a, "%"REG_d
  2237. );
  2238. udst += chromStride;
  2239. vdst += chromStride;
  2240. src += srcStride*2;
  2241. }
  /* Leave MMX state and order the non-temporal stores before the scalar tail runs. */
  2242. __asm__ volatile( EMMS" \n\t"
  2243. SFENCE" \n\t"
  2244. :::"memory");
  2245. #else
  /* No SIMD path compiled in: the scalar loop handles every row pair. */
  2246. y=0;
  2247. #endif
  /* Scalar path: continues from wherever y was left (row 0 without MMX). */
  2248. for (; y<height; y+=2)
  2249. {
  2250. long i;
  /* First row of the pair: two luma samples plus one U/V sample per pixel pair. */
  2251. for (i=0; i<chromWidth; i++)
  2252. {
  2253. unsigned int b = src[6*i+0];
  2254. unsigned int g = src[6*i+1];
  2255. unsigned int r = src[6*i+2];
  2256. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2257. unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
  2258. unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
  2259. udst[i] = U;
  2260. vdst[i] = V;
  2261. ydst[2*i] = Y;
  2262. b = src[6*i+3];
  2263. g = src[6*i+4];
  2264. r = src[6*i+5];
  2265. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2266. ydst[2*i+1] = Y;
  2267. }
  2268. ydst += lumStride;
  2269. src += srcStride;
  /* Second row of the pair: luma only, its chroma is not used. */
  2270. for (i=0; i<chromWidth; i++)
  2271. {
  2272. unsigned int b = src[6*i+0];
  2273. unsigned int g = src[6*i+1];
  2274. unsigned int r = src[6*i+2];
  2275. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2276. ydst[2*i] = Y;
  2277. b = src[6*i+3];
  2278. g = src[6*i+4];
  2279. r = src[6*i+5];
  2280. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2281. ydst[2*i+1] = Y;
  2282. }
  2283. udst += chromStride;
  2284. vdst += chromStride;
  2285. ydst += lumStride;
  2286. src += srcStride;
  2287. }
  2288. }
  2289. static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
  2290. long width, long height, long src1Stride,
  2291. long src2Stride, long dstStride){
  2292. long h;
  2293. for (h=0; h < height; h++)
  2294. {
  2295. long w;
  2296. #if HAVE_MMX
  2297. #if HAVE_SSE2
  2298. __asm__(
  2299. "xor %%"REG_a", %%"REG_a" \n\t"
  2300. "1: \n\t"
  2301. PREFETCH" 64(%1, %%"REG_a") \n\t"
  2302. PREFETCH" 64(%2, %%"REG_a") \n\t"
  2303. "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
  2304. "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
  2305. "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
  2306. "punpcklbw %%xmm2, %%xmm0 \n\t"
  2307. "punpckhbw %%xmm2, %%xmm1 \n\t"
  2308. "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
  2309. "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
  2310. "add $16, %%"REG_a" \n\t"
  2311. "cmp %3, %%"REG_a" \n\t"
  2312. " jb 1b \n\t"
  2313. ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
  2314. : "memory", "%"REG_a""
  2315. );
  2316. #else
  2317. __asm__(
  2318. "xor %%"REG_a", %%"REG_a" \n\t"
  2319. "1: \n\t"
  2320. PREFETCH" 64(%1, %%"REG_a") \n\t"
  2321. PREFETCH" 64(%2, %%"REG_a") \n\t"
  2322. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  2323. "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
  2324. "movq %%mm0, %%mm1 \n\t"
  2325. "movq %%mm2, %%mm3 \n\t"
  2326. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  2327. "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
  2328. "punpcklbw %%mm4, %%mm0 \n\t"
  2329. "punpckhbw %%mm4, %%mm1 \n\t"
  2330. "punpcklbw %%mm5, %%mm2 \n\t"
  2331. "punpckhbw %%mm5, %%mm3 \n\t"
  2332. MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
  2333. MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
  2334. MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
  2335. MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
  2336. "add $16, %%"REG_a" \n\t"
  2337. "cmp %3, %%"REG_a" \n\t"
  2338. " jb 1b \n\t"
  2339. ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
  2340. : "memory", "%"REG_a
  2341. );
  2342. #endif
  2343. for (w= (width&(~15)); w < width; w++)
  2344. {
  2345. dest[2*w+0] = src1[w];
  2346. dest[2*w+1] = src2[w];
  2347. }
  2348. #else
  2349. for (w=0; w < width; w++)
  2350. {
  2351. dest[2*w+0] = src1[w];
  2352. dest[2*w+1] = src2[w];
  2353. }
  2354. #endif
  2355. dest += dstStride;
  2356. src1 += src1Stride;
  2357. src2 += src2Stride;
  2358. }
  2359. #if HAVE_MMX
  2360. __asm__(
  2361. EMMS" \n\t"
  2362. SFENCE" \n\t"
  2363. ::: "memory"
  2364. );
  2365. #endif
  2366. }
  /**
   * Upsamples two planes by a factor of two in both directions.
   * For each plane, destination row y is built from source row y>>1, and every
   * source byte is written twice in a row (d[2*x] = d[2*x+1] = s[x]).
   * width/2 source bytes are consumed per row and height/2 destination rows are
   * produced per plane.
   *
   * @param src1, src2             source planes
   * @param dst1, dst2             destination planes (dst1 from src1, dst2 from src2)
   * @param width, height          halved internally to get the per-row byte count
   *                               and the destination row count
   * @param srcStride1, srcStride2 byte distance between source rows
   * @param dstStride1, dstStride2 byte distance between destination rows
   */
  2367. static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
  2368. uint8_t *dst1, uint8_t *dst2,
  2369. long width, long height,
  2370. long srcStride1, long srcStride2,
  2371. long dstStride1, long dstStride2)
  2372. {
  2373. x86_reg y;
  2374. long x,w,h;
  2375. w=width/2; h=height/2;
  2376. #if HAVE_MMX
  /* Prefetch hint for the second row of each source plane. */
  2377. __asm__ volatile(
  2378. PREFETCH" %0 \n\t"
  2379. PREFETCH" %1 \n\t"
  2380. ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
  2381. #endif
  /* First plane: src1 -> dst1. */
  2382. for (y=0;y<h;y++){
  2383. const uint8_t* s1=src1+srcStride1*(y>>1);
  2384. uint8_t* d=dst1+dstStride1*y;
  2385. x=0;
  2386. #if HAVE_MMX
  /* 32 source bytes -> 64 destination bytes per pass; punpck*bw of a register
     with itself duplicates each byte. */
  2387. for (;x<w-31;x+=32)
  2388. {
  2389. __asm__ volatile(
  2390. PREFETCH" 32%1 \n\t"
  2391. "movq %1, %%mm0 \n\t"
  2392. "movq 8%1, %%mm2 \n\t"
  2393. "movq 16%1, %%mm4 \n\t"
  2394. "movq 24%1, %%mm6 \n\t"
  2395. "movq %%mm0, %%mm1 \n\t"
  2396. "movq %%mm2, %%mm3 \n\t"
  2397. "movq %%mm4, %%mm5 \n\t"
  2398. "movq %%mm6, %%mm7 \n\t"
  2399. "punpcklbw %%mm0, %%mm0 \n\t"
  2400. "punpckhbw %%mm1, %%mm1 \n\t"
  2401. "punpcklbw %%mm2, %%mm2 \n\t"
  2402. "punpckhbw %%mm3, %%mm3 \n\t"
  2403. "punpcklbw %%mm4, %%mm4 \n\t"
  2404. "punpckhbw %%mm5, %%mm5 \n\t"
  2405. "punpcklbw %%mm6, %%mm6 \n\t"
  2406. "punpckhbw %%mm7, %%mm7 \n\t"
  2407. MOVNTQ" %%mm0, %0 \n\t"
  2408. MOVNTQ" %%mm1, 8%0 \n\t"
  2409. MOVNTQ" %%mm2, 16%0 \n\t"
  2410. MOVNTQ" %%mm3, 24%0 \n\t"
  2411. MOVNTQ" %%mm4, 32%0 \n\t"
  2412. MOVNTQ" %%mm5, 40%0 \n\t"
  2413. MOVNTQ" %%mm6, 48%0 \n\t"
  2414. MOVNTQ" %%mm7, 56%0"
  2415. :"=m"(d[2*x])
  2416. :"m"(s1[x])
  2417. :"memory");
  2418. }
  2419. #endif
  /* Scalar remainder (or the whole row without MMX). */
  2420. for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
  2421. }
  /* Second plane: src2 -> dst2, same procedure. */
  2422. for (y=0;y<h;y++){
  2423. const uint8_t* s2=src2+srcStride2*(y>>1);
  2424. uint8_t* d=dst2+dstStride2*y;
  2425. x=0;
  2426. #if HAVE_MMX
  2427. for (;x<w-31;x+=32)
  2428. {
  2429. __asm__ volatile(
  2430. PREFETCH" 32%1 \n\t"
  2431. "movq %1, %%mm0 \n\t"
  2432. "movq 8%1, %%mm2 \n\t"
  2433. "movq 16%1, %%mm4 \n\t"
  2434. "movq 24%1, %%mm6 \n\t"
  2435. "movq %%mm0, %%mm1 \n\t"
  2436. "movq %%mm2, %%mm3 \n\t"
  2437. "movq %%mm4, %%mm5 \n\t"
  2438. "movq %%mm6, %%mm7 \n\t"
  2439. "punpcklbw %%mm0, %%mm0 \n\t"
  2440. "punpckhbw %%mm1, %%mm1 \n\t"
  2441. "punpcklbw %%mm2, %%mm2 \n\t"
  2442. "punpckhbw %%mm3, %%mm3 \n\t"
  2443. "punpcklbw %%mm4, %%mm4 \n\t"
  2444. "punpckhbw %%mm5, %%mm5 \n\t"
  2445. "punpcklbw %%mm6, %%mm6 \n\t"
  2446. "punpckhbw %%mm7, %%mm7 \n\t"
  2447. MOVNTQ" %%mm0, %0 \n\t"
  2448. MOVNTQ" %%mm1, 8%0 \n\t"
  2449. MOVNTQ" %%mm2, 16%0 \n\t"
  2450. MOVNTQ" %%mm3, 24%0 \n\t"
  2451. MOVNTQ" %%mm4, 32%0 \n\t"
  2452. MOVNTQ" %%mm5, 40%0 \n\t"
  2453. MOVNTQ" %%mm6, 48%0 \n\t"
  2454. MOVNTQ" %%mm7, 56%0"
  2455. :"=m"(d[2*x])
  2456. :"m"(s2[x])
  2457. :"memory");
  2458. }
  2459. #endif
  2460. for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
  2461. }
  2462. #if HAVE_MMX
  /* Leave MMX state and order the non-temporal stores. */
  2463. __asm__(
  2464. EMMS" \n\t"
  2465. SFENCE" \n\t"
  2466. ::: "memory"
  2467. );
  2468. #endif
  2469. }
  /**
   * Packs three planes into an interleaved Y U Y V byte stream.
   * For every step x the output receives 8 bytes:
   *   yp[4x], up[x], yp[4x+1], vp[x], yp[4x+2], up[x], yp[4x+3], vp[x]
   * so one U/V pair is shared by four consecutive luma bytes. Chroma rows are
   * shared by four luma rows (row index y>>2).
   *
   * NOTE(review): x runs to width/2 and each step consumes 4 luma bytes, i.e.
   * 2*width luma bytes are read per row -- confirm the unit callers use for width.
   *
   * @param src1       luma plane
   * @param src2       plane written to the "U" positions
   * @param src3       plane written to the "V" positions
   * @param dst        packed destination
   * @param width      halved to obtain the per-row step count
   * @param height     number of destination rows
   * @param srcStride1, srcStride2, srcStride3 byte distance between source rows
   * @param dstStride  byte distance between destination rows
   */
  2470. static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
  2471. uint8_t *dst,
  2472. long width, long height,
  2473. long srcStride1, long srcStride2,
  2474. long srcStride3, long dstStride)
  2475. {
  2476. x86_reg x;
  2477. long y,w,h;
  2478. w=width/2; h=height;
  2479. for (y=0;y<h;y++){
  2480. const uint8_t* yp=src1+srcStride1*y;
  2481. const uint8_t* up=src2+srcStride2*(y>>2);
  2482. const uint8_t* vp=src3+srcStride3*(y>>2);
  2483. uint8_t* d=dst+dstStride*y;
  2484. x=0;
  2485. #if HAVE_MMX
  /* Eight steps per pass: 8 U + 8 V + 32 luma bytes in, 64 bytes out.
     The asm only reads x; the C loop header advances it. */
  2486. for (;x<w-7;x+=8)
  2487. {
  2488. __asm__ volatile(
  2489. PREFETCH" 32(%1, %0) \n\t"
  2490. PREFETCH" 32(%2, %0) \n\t"
  2491. PREFETCH" 32(%3, %0) \n\t"
  2492. "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
  2493. "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
  2494. "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
  2495. "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
  2496. "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
  2497. "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
  2498. "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
  2499. "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
  2500. "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
  2501. "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
  2502. "movq %%mm1, %%mm6 \n\t"
  2503. "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
  2504. "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
  2505. "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
  2506. MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
  2507. MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
  2508. "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
  2509. "movq 8(%1, %0, 4), %%mm0 \n\t"
  2510. "movq %%mm0, %%mm3 \n\t"
  2511. "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
  2512. "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
  2513. MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
  2514. MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
  2515. "movq %%mm4, %%mm6 \n\t"
  2516. "movq 16(%1, %0, 4), %%mm0 \n\t"
  2517. "movq %%mm0, %%mm3 \n\t"
  2518. "punpcklbw %%mm5, %%mm4 \n\t"
  2519. "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
  2520. "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
  2521. MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
  2522. MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
  2523. "punpckhbw %%mm5, %%mm6 \n\t"
  2524. "movq 24(%1, %0, 4), %%mm0 \n\t"
  2525. "movq %%mm0, %%mm3 \n\t"
  2526. "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
  2527. "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
  2528. MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
  2529. MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
  2530. : "+r" (x)
  2531. : "r"(yp), "r" (up), "r"(vp), "r"(d)
  2532. :"memory");
  2533. }
  2534. #endif
  /* Scalar remainder (or the whole row without MMX). */
  2535. for (; x<w; x++)
  2536. {
  2537. const long x2 = x<<2;
  2538. d[8*x+0] = yp[x2];
  2539. d[8*x+1] = up[x];
  2540. d[8*x+2] = yp[x2+1];
  2541. d[8*x+3] = vp[x];
  2542. d[8*x+4] = yp[x2+2];
  2543. d[8*x+5] = up[x];
  2544. d[8*x+6] = yp[x2+3];
  2545. d[8*x+7] = vp[x];
  2546. }
  2547. }
  2548. #if HAVE_MMX
  /* Leave MMX state and order the non-temporal stores. */
  2549. __asm__(
  2550. EMMS" \n\t"
  2551. SFENCE" \n\t"
  2552. ::: "memory"
  2553. );
  2554. #endif
  2555. }
  /**
   * Copies every second byte of src to dst: dst[i] = src[2*i] for i in [0, count).
   *
   * Both pointers are first moved to the end of their ranges and count is negated,
   * so the loops index with a negative offset that climbs towards zero.
   *
   * @param src   source bytes (2*count bytes are read)
   * @param dst   destination bytes (count bytes are written)
   * @param count number of bytes to produce
   */
  2556. static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
  2557. {
  2558. dst += count;
  2559. src += 2*count;
  2560. count= - count;
  2561. #if HAVE_MMX
  /* SIMD only when at least 16 outputs remain. */
  2562. if(count <= -16){
  /* Bias the counter by 15 so that "js" leaves the loop as soon as fewer than
     16 outputs are left; the constant displacements in the asm undo the bias. */
  2563. count += 15;
  /* mm7 = 0x00FF in every 16-bit word: keeps the low (even-index) byte of each pair.
     16 outputs are produced per pass. */
  2564. __asm__ volatile(
  2565. "pcmpeqw %%mm7, %%mm7 \n\t"
  2566. "psrlw $8, %%mm7 \n\t"
  2567. "1: \n\t"
  2568. "movq -30(%1, %0, 2), %%mm0 \n\t"
  2569. "movq -22(%1, %0, 2), %%mm1 \n\t"
  2570. "movq -14(%1, %0, 2), %%mm2 \n\t"
  2571. "movq -6(%1, %0, 2), %%mm3 \n\t"
  2572. "pand %%mm7, %%mm0 \n\t"
  2573. "pand %%mm7, %%mm1 \n\t"
  2574. "pand %%mm7, %%mm2 \n\t"
  2575. "pand %%mm7, %%mm3 \n\t"
  2576. "packuswb %%mm1, %%mm0 \n\t"
  2577. "packuswb %%mm3, %%mm2 \n\t"
  2578. MOVNTQ" %%mm0,-15(%2, %0) \n\t"
  2579. MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
  2580. "add $16, %0 \n\t"
  2581. " js 1b \n\t"
  2582. : "+r"(count)
  2583. : "r"(src), "r"(dst)
  2584. );
  /* Remove the bias; count is now minus the number of outputs still to do (0..15). */
  2585. count -= 15;
  2586. }
  2587. #endif
  /* Scalar tail (or everything when the SIMD path did not run). */
  2588. while(count<0){
  2589. dst[count]= src[2*count];
  2590. count++;
  2591. }
  2592. }
  /**
   * Splits bytes 0 and 2 of every 4-byte group into two planes:
   * dst0[i] = src[4*i+0] and dst1[i] = src[4*i+2] for i in [0, count).
   *
   * Pointers are moved to the end of their ranges and count is negated, so the
   * loops index with a negative offset that climbs towards zero.
   *
   * @param src   source bytes (4*count bytes span)
   * @param dst0  receives byte 0 of each group
   * @param dst1  receives byte 2 of each group
   * @param count number of 4-byte groups
   */
  2593. static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
  2594. {
  2595. dst0+= count;
  2596. dst1+= count;
  2597. src += 4*count;
  2598. count= - count;
  2599. #if HAVE_MMX
  /* SIMD only when at least 8 groups remain; 8 groups are handled per pass. */
  2600. if(count <= -8){
  /* Bias by 7 so "js" exits once fewer than 8 groups are left. */
  2601. count += 7;
  /* mm7 = 0x00FF per word. First mask+pack keeps bytes 0 and 2 of each group;
     the second shift/mask+pack separates them into the two output registers. */
  2602. __asm__ volatile(
  2603. "pcmpeqw %%mm7, %%mm7 \n\t"
  2604. "psrlw $8, %%mm7 \n\t"
  2605. "1: \n\t"
  2606. "movq -28(%1, %0, 4), %%mm0 \n\t"
  2607. "movq -20(%1, %0, 4), %%mm1 \n\t"
  2608. "movq -12(%1, %0, 4), %%mm2 \n\t"
  2609. "movq -4(%1, %0, 4), %%mm3 \n\t"
  2610. "pand %%mm7, %%mm0 \n\t"
  2611. "pand %%mm7, %%mm1 \n\t"
  2612. "pand %%mm7, %%mm2 \n\t"
  2613. "pand %%mm7, %%mm3 \n\t"
  2614. "packuswb %%mm1, %%mm0 \n\t"
  2615. "packuswb %%mm3, %%mm2 \n\t"
  2616. "movq %%mm0, %%mm1 \n\t"
  2617. "movq %%mm2, %%mm3 \n\t"
  2618. "psrlw $8, %%mm0 \n\t"
  2619. "psrlw $8, %%mm2 \n\t"
  2620. "pand %%mm7, %%mm1 \n\t"
  2621. "pand %%mm7, %%mm3 \n\t"
  2622. "packuswb %%mm2, %%mm0 \n\t"
  2623. "packuswb %%mm3, %%mm1 \n\t"
  2624. MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
  2625. MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
  2626. "add $8, %0 \n\t"
  2627. " js 1b \n\t"
  2628. : "+r"(count)
  2629. : "r"(src), "r"(dst0), "r"(dst1)
  2630. );
  /* Remove the bias; count is minus the number of groups still to do (0..7). */
  2631. count -= 7;
  2632. }
  2633. #endif
  /* Scalar tail (or everything when the SIMD path did not run). */
  2634. while(count<0){
  2635. dst0[count]= src[4*count+0];
  2636. dst1[count]= src[4*count+2];
  2637. count++;
  2638. }
  2639. }
  /**
   * Like extracting bytes 0 and 2 of every 4-byte group, but from the average of
   * two source rows: in the scalar loop
   *   dst0[i] = (src0[4*i+0] + src1[4*i+0]) >> 1
   *   dst1[i] = (src0[4*i+2] + src1[4*i+2]) >> 1
   * for i in [0, count).
   *
   * Pointers are moved to the end of their ranges and count is negated, so the
   * loops index with a negative offset that climbs towards zero.
   *
   * NOTE(review): the SIMD path averages with PAVGB while the scalar loop
   * truncates ((a+b)>>1); if PAVGB rounds up the two paths can differ by one --
   * confirm whether that matters to callers.
   *
   * @param src0, src1 the two source rows to average
   * @param dst0       receives the averaged byte 0 of each group
   * @param dst1       receives the averaged byte 2 of each group
   * @param count      number of 4-byte groups
   */
  2640. static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
  2641. {
  2642. dst0 += count;
  2643. dst1 += count;
  2644. src0 += 4*count;
  2645. src1 += 4*count;
  2646. count= - count;
  /* The SIMD path exists only when a byte-average instruction macro is defined. */
  2647. #ifdef PAVGB
  2648. if(count <= -8){
  /* Bias by 7 so "js" exits once fewer than 8 groups are left. */
  2649. count += 7;
  /* 8 groups per pass: average the two rows, then mask/pack as in the
     non-averaging variant (mm7 = 0x00FF per word). */
  2650. __asm__ volatile(
  2651. "pcmpeqw %%mm7, %%mm7 \n\t"
  2652. "psrlw $8, %%mm7 \n\t"
  2653. "1: \n\t"
  2654. "movq -28(%1, %0, 4), %%mm0 \n\t"
  2655. "movq -20(%1, %0, 4), %%mm1 \n\t"
  2656. "movq -12(%1, %0, 4), %%mm2 \n\t"
  2657. "movq -4(%1, %0, 4), %%mm3 \n\t"
  2658. PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
  2659. PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
  2660. PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
  2661. PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
  2662. "pand %%mm7, %%mm0 \n\t"
  2663. "pand %%mm7, %%mm1 \n\t"
  2664. "pand %%mm7, %%mm2 \n\t"
  2665. "pand %%mm7, %%mm3 \n\t"
  2666. "packuswb %%mm1, %%mm0 \n\t"
  2667. "packuswb %%mm3, %%mm2 \n\t"
  2668. "movq %%mm0, %%mm1 \n\t"
  2669. "movq %%mm2, %%mm3 \n\t"
  2670. "psrlw $8, %%mm0 \n\t"
  2671. "psrlw $8, %%mm2 \n\t"
  2672. "pand %%mm7, %%mm1 \n\t"
  2673. "pand %%mm7, %%mm3 \n\t"
  2674. "packuswb %%mm2, %%mm0 \n\t"
  2675. "packuswb %%mm3, %%mm1 \n\t"
  2676. MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
  2677. MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
  2678. "add $8, %0 \n\t"
  2679. " js 1b \n\t"
  2680. : "+r"(count)
  2681. : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
  2682. );
  /* Remove the bias; count is minus the number of groups still to do (0..7). */
  2683. count -= 7;
  2684. }
  2685. #endif
  /* Scalar tail (or everything when the SIMD path did not run). */
  2686. while(count<0){
  2687. dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
  2688. dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
  2689. count++;
  2690. }
  2691. }
  /**
   * Splits bytes 1 and 3 of every 4-byte group into two planes:
   * dst0[i] = src[4*i+1] and dst1[i] = src[4*i+3] for i in [0, count).
   *
   * Pointers are moved to the end of their ranges and count is negated, so the
   * loops index with a negative offset that climbs towards zero.
   *
   * @param src   source bytes (4*count bytes span)
   * @param dst0  receives byte 1 of each group
   * @param dst1  receives byte 3 of each group
   * @param count number of 4-byte groups
   */
  2692. static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
  2693. {
  2694. dst0+= count;
  2695. dst1+= count;
  2696. src += 4*count;
  2697. count= - count;
  2698. #if HAVE_MMX
  /* SIMD only when at least 8 groups remain; 8 groups are handled per pass. */
  2699. if(count <= -8){
  /* Bias by 7 so "js" exits once fewer than 8 groups are left. */
  2700. count += 7;
  /* The first psrlw $8 keeps the high (odd-index) byte of each 16-bit word;
     the second shift/mask+pack separates bytes 1 and 3 (mm7 = 0x00FF per word). */
  2701. __asm__ volatile(
  2702. "pcmpeqw %%mm7, %%mm7 \n\t"
  2703. "psrlw $8, %%mm7 \n\t"
  2704. "1: \n\t"
  2705. "movq -28(%1, %0, 4), %%mm0 \n\t"
  2706. "movq -20(%1, %0, 4), %%mm1 \n\t"
  2707. "movq -12(%1, %0, 4), %%mm2 \n\t"
  2708. "movq -4(%1, %0, 4), %%mm3 \n\t"
  2709. "psrlw $8, %%mm0 \n\t"
  2710. "psrlw $8, %%mm1 \n\t"
  2711. "psrlw $8, %%mm2 \n\t"
  2712. "psrlw $8, %%mm3 \n\t"
  2713. "packuswb %%mm1, %%mm0 \n\t"
  2714. "packuswb %%mm3, %%mm2 \n\t"
  2715. "movq %%mm0, %%mm1 \n\t"
  2716. "movq %%mm2, %%mm3 \n\t"
  2717. "psrlw $8, %%mm0 \n\t"
  2718. "psrlw $8, %%mm2 \n\t"
  2719. "pand %%mm7, %%mm1 \n\t"
  2720. "pand %%mm7, %%mm3 \n\t"
  2721. "packuswb %%mm2, %%mm0 \n\t"
  2722. "packuswb %%mm3, %%mm1 \n\t"
  2723. MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
  2724. MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
  2725. "add $8, %0 \n\t"
  2726. " js 1b \n\t"
  2727. : "+r"(count)
  2728. : "r"(src), "r"(dst0), "r"(dst1)
  2729. );
  /* Remove the bias; count is minus the number of groups still to do (0..7). */
  2730. count -= 7;
  2731. }
  2732. #endif
  /* Shift the base by one byte so offsets +0/+2 below address bytes 1 and 3. */
  2733. src++;
  /* Scalar tail (or everything when the SIMD path did not run). */
  2734. while(count<0){
  2735. dst0[count]= src[4*count+0];
  2736. dst1[count]= src[4*count+2];
  2737. count++;
  2738. }
  2739. }
  /**
   * Extracts bytes 1 and 3 of every 4-byte group from the average of two source
   * rows: in the scalar loop (after both bases are advanced by one byte)
   *   dst0[i] = (src0[4*i+1] + src1[4*i+1]) >> 1
   *   dst1[i] = (src0[4*i+3] + src1[4*i+3]) >> 1
   * for i in [0, count), with offsets relative to the original pointers.
   *
   * Pointers are moved to the end of their ranges and count is negated, so the
   * loops index with a negative offset that climbs towards zero.
   *
   * NOTE(review): the SIMD path averages with PAVGB while the scalar loop
   * truncates ((a+b)>>1); if PAVGB rounds up the two paths can differ by one --
   * confirm whether that matters to callers.
   *
   * @param src0, src1 the two source rows to average
   * @param dst0       receives the averaged byte 1 of each group
   * @param dst1       receives the averaged byte 3 of each group
   * @param count      number of 4-byte groups
   */
  2740. static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
  2741. {
  2742. dst0 += count;
  2743. dst1 += count;
  2744. src0 += 4*count;
  2745. src1 += 4*count;
  2746. count= - count;
  /* The SIMD path exists only when a byte-average instruction macro is defined. */
  2747. #ifdef PAVGB
  2748. if(count <= -8){
  /* Bias by 7 so "js" exits once fewer than 8 groups are left. */
  2749. count += 7;
  /* 8 groups per pass: average the two rows, keep the odd bytes with psrlw $8,
     then separate bytes 1 and 3 (mm7 = 0x00FF per word). */
  2750. __asm__ volatile(
  2751. "pcmpeqw %%mm7, %%mm7 \n\t"
  2752. "psrlw $8, %%mm7 \n\t"
  2753. "1: \n\t"
  2754. "movq -28(%1, %0, 4), %%mm0 \n\t"
  2755. "movq -20(%1, %0, 4), %%mm1 \n\t"
  2756. "movq -12(%1, %0, 4), %%mm2 \n\t"
  2757. "movq -4(%1, %0, 4), %%mm3 \n\t"
  2758. PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
  2759. PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
  2760. PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
  2761. PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
  2762. "psrlw $8, %%mm0 \n\t"
  2763. "psrlw $8, %%mm1 \n\t"
  2764. "psrlw $8, %%mm2 \n\t"
  2765. "psrlw $8, %%mm3 \n\t"
  2766. "packuswb %%mm1, %%mm0 \n\t"
  2767. "packuswb %%mm3, %%mm2 \n\t"
  2768. "movq %%mm0, %%mm1 \n\t"
  2769. "movq %%mm2, %%mm3 \n\t"
  2770. "psrlw $8, %%mm0 \n\t"
  2771. "psrlw $8, %%mm2 \n\t"
  2772. "pand %%mm7, %%mm1 \n\t"
  2773. "pand %%mm7, %%mm3 \n\t"
  2774. "packuswb %%mm2, %%mm0 \n\t"
  2775. "packuswb %%mm3, %%mm1 \n\t"
  2776. MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
  2777. MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
  2778. "add $8, %0 \n\t"
  2779. " js 1b \n\t"
  2780. : "+r"(count)
  2781. : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
  2782. );
  /* Remove the bias; count is minus the number of groups still to do (0..7). */
  2783. count -= 7;
  2784. }
  2785. #endif
  /* Shift both bases by one byte so offsets +0/+2 below address bytes 1 and 3. */
  2786. src0++;
  2787. src1++;
  /* Scalar tail (or everything when the SIMD path did not run). */
  2788. while(count<0){
  2789. dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
  2790. dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
  2791. count++;
  2792. }
  2793. }
  2794. static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2795. long width, long height,
  2796. long lumStride, long chromStride, long srcStride)
  2797. {
  2798. long y;
  2799. const long chromWidth= -((-width)>>1);
  2800. for (y=0; y<height; y++){
  2801. RENAME(extract_even)(src, ydst, width);
  2802. if(y&1){
  2803. RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
  2804. udst+= chromStride;
  2805. vdst+= chromStride;
  2806. }
  2807. src += srcStride;
  2808. ydst+= lumStride;
  2809. }
  2810. #if HAVE_MMX
  2811. __asm__(
  2812. EMMS" \n\t"
  2813. SFENCE" \n\t"
  2814. ::: "memory"
  2815. );
  2816. #endif
  2817. }
  2818. static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2819. long width, long height,
  2820. long lumStride, long chromStride, long srcStride)
  2821. {
  2822. long y;
  2823. const long chromWidth= -((-width)>>1);
  2824. for (y=0; y<height; y++){
  2825. RENAME(extract_even)(src, ydst, width);
  2826. RENAME(extract_odd2)(src, udst, vdst, chromWidth);
  2827. src += srcStride;
  2828. ydst+= lumStride;
  2829. udst+= chromStride;
  2830. vdst+= chromStride;
  2831. }
  2832. #if HAVE_MMX
  2833. __asm__(
  2834. EMMS" \n\t"
  2835. SFENCE" \n\t"
  2836. ::: "memory"
  2837. );
  2838. #endif
  2839. }
  2840. static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2841. long width, long height,
  2842. long lumStride, long chromStride, long srcStride)
  2843. {
  2844. long y;
  2845. const long chromWidth= -((-width)>>1);
  2846. for (y=0; y<height; y++){
  2847. RENAME(extract_even)(src+1, ydst, width);
  2848. if(y&1){
  2849. RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
  2850. udst+= chromStride;
  2851. vdst+= chromStride;
  2852. }
  2853. src += srcStride;
  2854. ydst+= lumStride;
  2855. }
  2856. #if HAVE_MMX
  2857. __asm__(
  2858. EMMS" \n\t"
  2859. SFENCE" \n\t"
  2860. ::: "memory"
  2861. );
  2862. #endif
  2863. }
  2864. static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2865. long width, long height,
  2866. long lumStride, long chromStride, long srcStride)
  2867. {
  2868. long y;
  2869. const long chromWidth= -((-width)>>1);
  2870. for (y=0; y<height; y++){
  2871. RENAME(extract_even)(src+1, ydst, width);
  2872. RENAME(extract_even2)(src, udst, vdst, chromWidth);
  2873. src += srcStride;
  2874. ydst+= lumStride;
  2875. udst+= chromStride;
  2876. vdst+= chromStride;
  2877. }
  2878. #if HAVE_MMX
  2879. __asm__(
  2880. EMMS" \n\t"
  2881. SFENCE" \n\t"
  2882. ::: "memory"
  2883. );
  2884. #endif
  2885. }
  2886. static inline void RENAME(rgb2rgb_init)(void){
  2887. rgb15to16 = RENAME(rgb15to16);
  2888. rgb15tobgr24 = RENAME(rgb15tobgr24);
  2889. rgb15to32 = RENAME(rgb15to32);
  2890. rgb16tobgr24 = RENAME(rgb16tobgr24);
  2891. rgb16to32 = RENAME(rgb16to32);
  2892. rgb16to15 = RENAME(rgb16to15);
  2893. rgb24tobgr16 = RENAME(rgb24tobgr16);
  2894. rgb24tobgr15 = RENAME(rgb24tobgr15);
  2895. rgb24tobgr32 = RENAME(rgb24tobgr32);
  2896. rgb32to16 = RENAME(rgb32to16);
  2897. rgb32to15 = RENAME(rgb32to15);
  2898. rgb32tobgr24 = RENAME(rgb32tobgr24);
  2899. rgb24to15 = RENAME(rgb24to15);
  2900. rgb24to16 = RENAME(rgb24to16);
  2901. rgb24tobgr24 = RENAME(rgb24tobgr24);
  2902. rgb32tobgr32 = RENAME(rgb32tobgr32);
  2903. rgb32tobgr16 = RENAME(rgb32tobgr16);
  2904. rgb32tobgr15 = RENAME(rgb32tobgr15);
  2905. yv12toyuy2 = RENAME(yv12toyuy2);
  2906. yv12touyvy = RENAME(yv12touyvy);
  2907. yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
  2908. yuv422ptouyvy = RENAME(yuv422ptouyvy);
  2909. yuy2toyv12 = RENAME(yuy2toyv12);
  2910. // yvu9toyv12 = RENAME(yvu9toyv12);
  2911. planar2x = RENAME(planar2x);
  2912. rgb24toyv12 = RENAME(rgb24toyv12);
  2913. interleaveBytes = RENAME(interleaveBytes);
  2914. vu9_to_vu12 = RENAME(vu9_to_vu12);
  2915. yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
  2916. uyvytoyuv420 = RENAME(uyvytoyuv420);
  2917. uyvytoyuv422 = RENAME(uyvytoyuv422);
  2918. yuyvtoyuv420 = RENAME(yuyvtoyuv420);
  2919. yuyvtoyuv422 = RENAME(yuyvtoyuv422);
  2920. }