You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

3015 lines
115KB

  1. /*
  2. * software RGB to RGB converter
  3. * pluralize by software PAL8 to RGB converter
  4. * software YUV to YUV converter
  5. * software YUV to RGB converter
  6. * Written by Nick Kurshev.
  7. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  8. * lot of big-endian byte order fixes by Alex Beregszaszi
  9. *
  10. * This file is part of FFmpeg.
  11. *
  12. * FFmpeg is free software; you can redistribute it and/or modify
  13. * it under the terms of the GNU General Public License as published by
  14. * the Free Software Foundation; either version 2 of the License, or
  15. * (at your option) any later version.
  16. *
  17. * FFmpeg is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20. * GNU General Public License for more details.
  21. *
  22. * You should have received a copy of the GNU General Public License
  23. * along with FFmpeg; if not, write to the Free Software
  24. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25. *
  26. * The C code (not assembly, MMX, ...) of this file can be used
  27. * under the LGPL license.
  28. */
  29. #include <stddef.h>
  30. #undef PREFETCH
  31. #undef MOVNTQ
  32. #undef EMMS
  33. #undef SFENCE
  34. #undef MMREG_SIZE
  35. #undef PREFETCHW
  36. #undef PAVGB
  37. #if HAVE_SSE2
  38. #define MMREG_SIZE 16
  39. #else
  40. #define MMREG_SIZE 8
  41. #endif
  42. #if HAVE_AMD3DNOW
  43. #define PREFETCH "prefetch"
  44. #define PREFETCHW "prefetchw"
  45. #define PAVGB "pavgusb"
  46. #elif HAVE_MMX2
  47. #define PREFETCH "prefetchnta"
  48. #define PREFETCHW "prefetcht0"
  49. #define PAVGB "pavgb"
  50. #else
  51. #define PREFETCH " # nop"
  52. #define PREFETCHW " # nop"
  53. #endif
  54. #if HAVE_AMD3DNOW
  55. /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
  56. #define EMMS "femms"
  57. #else
  58. #define EMMS "emms"
  59. #endif
  60. #if HAVE_MMX2
  61. #define MOVNTQ "movntq"
  62. #define SFENCE "sfence"
  63. #else
  64. #define MOVNTQ "movq"
  65. #define SFENCE " # nop"
  66. #endif
/*
 * Convert packed 24-bit pixels to 32-bit pixels, filling the extra byte
 * with 255 (opaque alpha).  MMX path: consumes 24 source bytes and emits
 * 32 destination bytes per iteration, OR-ing in mm7 (loaded from mask32a,
 * presumably the alpha-byte mask — defined elsewhere in the file).
 * The scalar tail handles the remaining 0..23 bytes.
 */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* Stop while at least 24 input bytes (8 pixels) remain. */
    mm_end = end - 23;
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            /* Gather four 3-byte pixels into each quadword via overlapping
             * dword loads at offsets 0/3, 6/9, 12/15, 18/21. */
            "movd %1, %%mm0 \n\t"
            "punpckldq 3%1, %%mm0 \n\t"
            "movd 6%1, %%mm1 \n\t"
            "punpckldq 9%1, %%mm1 \n\t"
            "movd 12%1, %%mm2 \n\t"
            "punpckldq 15%1, %%mm2 \n\t"
            "movd 18%1, %%mm3 \n\t"
            "punpckldq 21%1, %%mm3 \n\t"
            /* OR in the constant alpha byte of every pixel. */
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm2, 16%0 \n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (also the whole loop on non-MMX builds). */
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
#endif
    }
}
/*
 * Convert packed 32-bit pixels to 24-bit pixels by dropping the fourth
 * (alpha/filler) byte of each pixel.  MMX path: 32 source bytes -> 24
 * destination bytes per iteration, using mask24* constants (defined
 * elsewhere) to compact four 3-byte pixels per pair of quadwords.
 * The scalar tail handles the remaining 0..31 bytes.
 */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* Stop while at least 32 input bytes (8 pixels) remain. */
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 16%1, %%mm4 \n\t"
            "movq 24%1, %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            /* Shift each pixel's high bytes down over its filler byte,
             * then merge low/high halves with mask24l/mask24h. */
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"
            /* Re-pack the four 6-byte pixel pairs into three quadwords. */
            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (also the whole loop on non-MMX builds). */
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}
  205. /*
  206. original by Strepto/Astral
  207. ported to gcc & bugfixed: A'rpi
  208. MMX2, 3DNOW optimization by Nick Kurshev
  209. 32-bit C version, and and&add trick by Michael Niedermayer
  210. */
/*
 * Convert RGB555 to RGB565.  Uses the "and & add" trick: adding the
 * red+green field (x & 0x7FE0) to the whole 15-bit value shifts those
 * fields up by one bit, leaving blue (5 LSBs) in place and the new
 * green LSB zero.  MMX path processes 16 bytes (8 pixels) per iteration;
 * the C loop below it also serves as the full implementation on non-MMX
 * builds.  Assumes src_size is even (one trailing odd byte is ignored).
 */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            /* mm4 = mask15s: isolates the bits to duplicate-add. */
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* 32-bit C version: two pixels at a time with the same and&add trick. */
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* At most one 16-bit pixel left. */
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
/*
 * Convert RGB565 to RGB555: shift the red+green fields down one bit
 * (dropping the 6th green bit) and keep blue (5 LSBs) unchanged.
 * MMX path processes 16 bytes (8 pixels) per iteration with mask15rg /
 * mask15b (defined elsewhere); the C loops below also serve as the full
 * implementation on non-MMX builds.  Assumes src_size is even.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            /* (x >> 1) & mask15rg  |  x & mask15b */
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* 32-bit C version: two pixels at a time. */
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* At most one 16-bit pixel left. */
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
/*
 * Convert 32-bit pixels (B,G,R,X byte order on little-endian) to RGB565.
 * Scalar formula per pixel: (B>>3) | ((G&0xFC)<<3) | ((R&0xF8)<<8).
 * The enabled MMX path uses a pmaddwd multiply trick (mask3216g /
 * mask3216br / mul3216 constants, defined elsewhere) to position the
 * three fields in one multiply-add; the #else path is a classic
 * shift-and-mask version kept for reference.  16 source bytes
 * (4 pixels) are consumed per MMX iteration.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        /* green via mask+pmaddwd, blue/red via mask; combine and shift
         * into the 5-6-5 layout. */
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    /* Alternate shift-and-mask implementation (disabled by the #if 1). */
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (also the whole loop on non-MMX builds). */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
/*
 * Convert 32-bit pixels to 16-bit 565 with the red/blue channels swapped
 * relative to rgb32to16.  Scalar formula per pixel:
 * ((x&0xF8)<<8) | ((x&0xFC00)>>5) | ((x&0xF80000)>>19).
 * MMX path consumes 16 source bytes (4 pixels) per iteration using
 * red_16mask / green_16mask / blue_16mask (defined elsewhere).
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            /* Note the <<8 here (vs >>3 in rgb32to16): channel swap. */
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (also the whole loop on non-MMX builds). */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
/*
 * Convert 32-bit pixels to RGB555 (5-5-5, top bit unused).
 * Scalar formula per pixel:
 * ((x&0xFF)>>3) | ((x&0xF800)>>6) | ((x&0xF80000)>>9).
 * Structured identically to rgb32to16: the enabled MMX path uses a
 * pmaddwd multiply trick (mask3215g / mask3216br / mul3215, defined
 * elsewhere), differing from the 565 version only in the green mask,
 * multiplier, and final shift amounts ($6/$10 vs $5/$11); the #else
 * path is the reference shift-and-mask version.
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        /* 5-5-5 layout: shifts differ from the 565 variant. */
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    /* Alternate shift-and-mask implementation (disabled by the #if 1). */
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (also the whole loop on non-MMX builds). */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
/*
 * Convert 32-bit pixels to 15-bit 555 with red/blue swapped relative to
 * rgb32to15.  Scalar formula per pixel:
 * ((x&0xF8)<<7) | ((x&0xF800)>>6) | ((x&0xF80000)>>19).
 * MMX path consumes 16 source bytes (4 pixels) per iteration using
 * red_15mask / green_15mask / blue_15mask (defined elsewhere).
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            /* <<7 (vs >>3 in rgb32to15): channel swap. */
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (also the whole loop on non-MMX builds). */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
/*
 * Convert packed 24-bit pixels to 16-bit 565, reading the first source
 * byte as blue (see scalar tail: b,g,r read order).  MMX path: four
 * 3-byte pixels are gathered per quadword via overlapping dword loads
 * at offsets 0/3/6/9, then shifted/masked into 5-6-5; 12 source bytes
 * (4 pixels) per iteration.  NOTE(review): the overlapping "movd 9%1"
 * load reads 3 bytes past the 12 consumed — the - 11 loop bound keeps
 * it within the buffer.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (also the whole loop on non-MMX builds). */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
/*
 * Convert packed 24-bit pixels to 16-bit 565, reading the first source
 * byte as red (scalar tail reads r,g,b — the mirror of rgb24tobgr16).
 * MMX path gathers four 3-byte pixels per quadword via overlapping
 * dword loads and uses the <<8 / >>19 shift pattern to swap red/blue
 * relative to rgb24tobgr16; 12 source bytes (4 pixels) per iteration.
 * NOTE(review): the loop bound is - 15 here (vs - 11 in rgb24tobgr16),
 * a more conservative margin for the overlapping reads.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (also the whole loop on non-MMX builds). */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
/*
 * Convert packed 24-bit pixels to 15-bit 555, reading the first source
 * byte as blue (scalar tail reads b,g,r).  Same overlapping-load gather
 * as rgb24tobgr16, but with 555 shift amounts ($6/$9) and the
 * red_15mask / green_15mask / blue_15mask constants (defined elsewhere).
 * 12 source bytes (4 pixels) per MMX iteration; loop bound - 11 keeps
 * the overlapping "movd 9%1" read inside the buffer.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (also the whole loop on non-MMX builds). */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
/*
 * Convert packed 24-bit pixels to 15-bit 555, reading the first source
 * byte as red (scalar tail reads r,g,b — the mirror of rgb24tobgr15).
 * MMX path uses the <<7 / >>19 shift pattern to swap red/blue relative
 * to rgb24tobgr15; 12 source bytes (4 pixels) per iteration with the
 * more conservative - 15 loop bound (cf. - 11 in rgb24tobgr15).
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (also the whole loop on non-MMX builds). */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
  852. /*
  853. I use less accurate approximation here by simply left-shifting the input
  854. value and filling the low order bits with zeroes. This method improves PNG
  855. compression but this scheme cannot reproduce white exactly, since it does
  856. not generate an all-ones maximum value; the net effect is to darken the
  857. image slightly.
  858. The better method should be "left bit replication":
  859. 4 3 2 1 0
  860. ---------
  861. 1 1 0 1 1
  862. 7 6 5 4 3 2 1 0
  863. ----------------
  864. 1 1 0 1 1 1 1 0
  865. |=======| |===|
  866. | leftmost bits repeated to fill open bits
  867. |
  868. original bits
  869. */
/*
 * Convert 15-bit 555 pixels to packed 24-bit pixels.  Each 5-bit field
 * is expanded to 8 bits by a plain left shift (low bits zero-filled —
 * see the accuracy note in the comment block above this function).
 * MMX path: 8 pixels per iteration.  The first asm block extracts and
 * expands b/g/r into byte lanes and interleaves them into four
 * 32-bit-per-pixel quadwords (mm0,mm3 and saved copies in mm6,mm7);
 * the second asm block ("borrowed 32 to 24") re-packs those into three
 * 24-bit-per-pixel quadwords, exactly like rgb32tobgr24's core.
 * Scalar tail: (x&0x1F)<<3, (x&0x3E0)>>2, (x&0x7C00)>>7 per pixel.
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            /* First 4 pixels: isolate each channel, scale to 8 bits. */
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            /* Widen 16-bit lanes to 32-bit with zeros (mmx_null). */
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            /* Merge channels into byte positions 0/1/2 of each dword. */
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            /* Park pixels 0-3 in mm6/mm7, then do pixels 4-7 in mm0/mm3. */
            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"
            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        /* Relies on mm0,mm3,mm6,mm7 surviving between the two asm
         * statements — fragile but intentional; do not separate them. */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"
            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (also the whole loop on non-MMX builds). */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
/**
 * Convert packed 16-bit RGB565 pixels to packed 24-bit pixels with the
 * R and B channels swapped.
 * src      : input pixels, 2 bytes per pixel
 * dst      : output pixels, 3 bytes per pixel
 * src_size : size of the input in bytes
 * The MMX path converts 8 pixels per iteration; the scalar loop at the
 * bottom finishes the remainder (and does everything when MMX is off).
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7; /* leave fewer than 8 pixels for the scalar tail */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            /* pixels 0..3: isolate the 5/6/5 bit fields (masks %2/%3/%4)
             * and shift each channel into the low 8 bits of its word */
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            /* widen each channel word to a dword (%5 is a zero register)
             * and merge B|G<<8|R<<16 per pixel */
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            /* park pixels 0..3 in mm6/mm7 and repeat for pixels 4..7 */
            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"
            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 : squeeze the 8 BGR0 dwords held in
         * mm6/mm7/mm0/mm3 into 24 contiguous output bytes */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"
            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24; /* 8 pixels * 3 bytes */
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: expand 5/6/5 fields to 8 bits by plain shifts */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
  1132. /*
  1133. * mm0 = 00 B3 00 B2 00 B1 00 B0
  1134. * mm1 = 00 G3 00 G2 00 G1 00 G0
  1135. * mm2 = 00 R3 00 R2 00 R1 00 R0
  1136. * mm6 = FF FF FF FF FF FF FF FF
  1137. * mm7 = 00 00 00 00 00 00 00 00
  1138. */
  1139. #define PACK_RGB32 \
  1140. "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
  1141. "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
  1142. "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
  1143. "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
  1144. "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
  1145. "movq %%mm0, %%mm3 \n\t" \
  1146. "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
  1147. "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
  1148. MOVNTQ" %%mm0, %0 \n\t" \
  1149. MOVNTQ" %%mm3, 8%0 \n\t" \
/**
 * Convert packed 15-bit RGB555 pixels to 32-bit pixels with the alpha
 * byte forced to 255.
 * src      : input pixels, 2 bytes per pixel
 * dst      : output pixels, 4 bytes per pixel (B,G,R,FF in memory)
 * src_size : size of the input in bytes
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* mm7 = 0 (packing zero), mm6 = all ones (alpha) — see PACK_RGB32 */
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3; /* 4 pixels per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            /* split the 5/5/5 fields and scale each channel to 8 bits */
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail; the alpha byte position depends on endianness */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
#endif
    }
}
/**
 * Convert packed 16-bit RGB565 pixels to 32-bit pixels with the alpha
 * byte forced to 255.
 * src      : input pixels, 2 bytes per pixel
 * dst      : output pixels, 4 bytes per pixel (B,G,R,FF in memory)
 * src_size : size of the input in bytes
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* mm7 = 0 (packing zero), mm6 = all ones (alpha) — see PACK_RGB32 */
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3; /* 4 pixels per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            /* split the 5/6/5 fields and scale each channel to 8 bits */
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail; the alpha byte position depends on endianness */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
#endif
    }
}
/**
 * Swap the R and B bytes of 32-bit pixels (the G and alpha bytes stay
 * in place).
 * src      : input pixels, 4 bytes per pixel
 * dst      : output pixels, 4 bytes per pixel
 * src_size : size of the input in bytes (assumed to be a multiple of 4)
 *
 * The loop counter idx starts at the negative value 15 - src_size and
 * counts up towards 15: the MMX loop runs 16 bytes at a time while idx
 * is negative, and the scalar loop below finishes the last (up to 15)
 * remaining bytes 4 at a time.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    x86_reg idx = 15 - src_size;
    /* bias the pointers so that s[idx]/d[idx] address the buffers */
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
#if HAVE_MMX
    __asm__ volatile(
        /* skip the MMX loop entirely for tiny buffers (idx >= 0) */
        "test %0, %0 \n\t"
        "jns 2f \n\t"
        PREFETCH" (%1, %0) \n\t"
        /* build the byte-select masks: mm7 selects G+A, mm6 selects R+B */
        "movq %3, %%mm7 \n\t"
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
# if HAVE_MMX2
        /* pshufw $177 swaps the words within each dword (R<->B path) */
        "pshufw $177, %%mm0, %%mm3 \n\t"
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# else
        /* plain-MMX fallback: shift R and B into each other's slots */
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
#endif
    /* scalar tail: swap R and B one 32-bit pixel at a time */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
/**
 * Swap the R and B bytes of packed 24-bit pixels.
 * src      : input pixels, 3 bytes per pixel
 * dst      : output pixels, 3 bytes per pixel
 * src_size : size of the input in bytes
 * The MMX loop handles 8 pixels (24 bytes) per iteration using a
 * negative counter in REG_a; the C loop handles the remainder.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
#if HAVE_MMX
    x86_reg mmx_size= 23 - src_size;
    /* NOTE(review): this asm stores through %2 but declares no "memory"
     * clobber; presumably safe because of the volatile qualifier and the
     * SFENCE/EMMS statements below — confirm before reordering. */
    __asm__ volatile (
        "test %%"REG_a", %%"REG_a" \n\t"
        "jns 2f \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a") \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add $24, %%"REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    if (mmx_size==23) return; //finished, src_size was a multiple of 8 pixels (24 bytes)
    /* rewind pointers to the unconverted tail for the C loop below */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    /* scalar loop: swap bytes 0 and 2 of each 3-byte pixel */
    for (i=0; i<src_size; i+=3) {
        register uint8_t x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
/**
 * Interleave planar YUV into packed YUY2 (byte order Y0 U0 Y1 V0).
 * ysrc/usrc/vsrc : source planes
 * dst            : packed output, 2 bytes per luma sample
 * width, height  : luma dimensions (width is halved for chroma)
 * lumStride, chromStride, dstStride : per-plane line strides in bytes
 * vertLumPerChroma : number of luma lines that share one chroma line;
 *     must be a power of two — see the (y & (vertLumPerChroma-1)) test
 *     that decides when to advance usrc/vsrc.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* 16 luma samples (32 output bytes) per iteration */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else
#if ARCH_ALPHA && HAVE_MVI
        /* Alpha/MVI path: builds two output lines per iteration with the
         * unpkbw/unpkbl byte-spreading instructions */
#define pl2yuy2(n) \
    y1 = yc[n]; \
    y2 = yc2[n]; \
    u = uc[n]; \
    v = vc[n]; \
    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
    yuv1 = (u << 8) + (v << 24); \
    yuv2 = yuv1 + y2; \
    yuv1 += y1; \
    qdst[n] = yuv1; \
    qdst2[n] = yuv2;
        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8) {
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            __asm__("ldq $31,64(%0)" :: "r"(yc));
            __asm__("ldq $31,64(%0)" :: "r"(yc2));
            __asm__("ldq $31,64(%0)" :: "r"(uc));
            __asm__("ldq $31,64(%0)" :: "r"(vc));
            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);
            yc += 4;
            yc2 += 4;
            uc += 4;
            vc += 4;
            qdst += 4;
            qdst2 += 4;
        }
        /* the Alpha path emitted two lines, so advance past the extra one */
        y++;
        ysrc += lumStride;
        dst += dstStride;
#elif HAVE_FAST_64BIT
        /* generic 64-bit path: assemble two YUY2 dwords per store */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2) {
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }
#else
        /* portable 32-bit path: one YUY2 dword per iteration */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* advance chroma only every vertLumPerChroma luma lines */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#if HAVE_MMX
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
#endif
}
  1510. /**
  1511. * Height should be a multiple of 2 and width should be a multiple of 16.
  1512. * (If this is a problem for anyone then tell me, and I will fix it.)
  1513. */
  1514. static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1515. long width, long height,
  1516. long lumStride, long chromStride, long dstStride)
  1517. {
  1518. //FIXME interpolate chroma
  1519. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1520. }
/**
 * Interleave planar YUV into packed UYVY (byte order U0 Y0 V0 Y1).
 * Same parameter contract as yuvPlanartoyuy2; only the byte order of
 * the packed output differs.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
            "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else
//FIXME adapt the Alpha ASM code from yv12->yuy2
#if HAVE_FAST_64BIT
        /* generic 64-bit path: assemble two UYVY dwords per store */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2) {
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }
#else
        /* portable 32-bit path: one UYVY dword per iteration */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                (vc[0] << 8) + (yc[1] << 0);
#else
            *idst++ = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* advance chroma only every vertLumPerChroma luma lines */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#if HAVE_MMX
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
#endif
}
  1607. /**
  1608. * Height should be a multiple of 2 and width should be a multiple of 16
  1609. * (If this is a problem for anyone then tell me, and I will fix it.)
  1610. */
  1611. static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1612. long width, long height,
  1613. long lumStride, long chromStride, long dstStride)
  1614. {
  1615. //FIXME interpolate chroma
  1616. RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1617. }
  1618. /**
  1619. * Width should be a multiple of 16.
  1620. */
  1621. static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1622. long width, long height,
  1623. long lumStride, long chromStride, long dstStride)
  1624. {
  1625. RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1626. }
  1627. /**
  1628. * Width should be a multiple of 16.
  1629. */
  1630. static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1631. long width, long height,
  1632. long lumStride, long chromStride, long dstStride)
  1633. {
  1634. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1635. }
  1636. /**
  1637. * Height should be a multiple of 2 and width should be a multiple of 16.
  1638. * (If this is a problem for anyone then tell me, and I will fix it.)
  1639. */
  1640. static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1641. long width, long height,
  1642. long lumStride, long chromStride, long srcStride)
  1643. {
  1644. long y;
  1645. const x86_reg chromWidth= width>>1;
  1646. for (y=0; y<height; y+=2) {
  1647. #if HAVE_MMX
  1648. __asm__ volatile(
  1649. "xor %%"REG_a", %%"REG_a" \n\t"
  1650. "pcmpeqw %%mm7, %%mm7 \n\t"
  1651. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1652. ASMALIGN(4)
  1653. "1: \n\t"
  1654. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1655. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1656. "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
  1657. "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
  1658. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
  1659. "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
  1660. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
  1661. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1662. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1663. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1664. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1665. MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
  1666. "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
  1667. "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
  1668. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
  1669. "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
  1670. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
  1671. "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
  1672. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1673. "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1674. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1675. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1676. MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
  1677. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1678. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1679. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1680. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1681. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1682. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1683. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1684. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1685. MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
  1686. MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
  1687. "add $8, %%"REG_a" \n\t"
  1688. "cmp %4, %%"REG_a" \n\t"
  1689. " jb 1b \n\t"
  1690. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1691. : "memory", "%"REG_a
  1692. );
  1693. ydst += lumStride;
  1694. src += srcStride;
  1695. __asm__ volatile(
  1696. "xor %%"REG_a", %%"REG_a" \n\t"
  1697. ASMALIGN(4)
  1698. "1: \n\t"
  1699. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1700. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1701. "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
  1702. "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
  1703. "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
  1704. "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1705. "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1706. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1707. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1708. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1709. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1710. MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
  1711. MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
  1712. "add $8, %%"REG_a" \n\t"
  1713. "cmp %4, %%"REG_a" \n\t"
  1714. " jb 1b \n\t"
  1715. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1716. : "memory", "%"REG_a
  1717. );
  1718. #else
  1719. long i;
  1720. for (i=0; i<chromWidth; i++) {
  1721. ydst[2*i+0] = src[4*i+0];
  1722. udst[i] = src[4*i+1];
  1723. ydst[2*i+1] = src[4*i+2];
  1724. vdst[i] = src[4*i+3];
  1725. }
  1726. ydst += lumStride;
  1727. src += srcStride;
  1728. for (i=0; i<chromWidth; i++) {
  1729. ydst[2*i+0] = src[4*i+0];
  1730. ydst[2*i+1] = src[4*i+2];
  1731. }
  1732. #endif
  1733. udst += chromStride;
  1734. vdst += chromStride;
  1735. ydst += lumStride;
  1736. src += srcStride;
  1737. }
  1738. #if HAVE_MMX
  1739. __asm__ volatile(EMMS" \n\t"
  1740. SFENCE" \n\t"
  1741. :::"memory");
  1742. #endif
  1743. }
  1744. static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
  1745. uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1746. long width, long height, long lumStride, long chromStride)
  1747. {
  1748. /* Y Plane */
  1749. memcpy(ydst, ysrc, width*height);
  1750. /* XXX: implement upscaling for U,V */
  1751. }
/**
 * Upscale a single plane by 2x in both directions using 3:1/1:3
 * bilinear weighting.
 * src/dst          : source and destination planes
 * srcWidth/Height  : source dimensions (destination is twice as large)
 * srcStride/dstStride : line strides in bytes
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;
    dst[0]= src[0];
    // first line
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
    dst+= dstStride;
    for (y=1; y<srcHeight; y++) {
#if HAVE_MMX2 || HAVE_AMD3DNOW
        const x86_reg mmxSize= srcWidth&~15;
        /* Two PAVGB passes against the same register approximate the
         * (3*a + b) >> 2 weighting of the C code.
         * NOTE(review): the "-1(%0/%1, REG_a)" loads read one byte before
         * the row start (src[-1] on the first processed row), and when
         * srcWidth < 16 mmxSize is 0 yet the loop body still executes
         * once — both look like latent out-of-bounds accesses; confirm
         * before relying on this with unpadded buffers. */
        __asm__ volatile(
            "mov %4, %%"REG_a" \n\t"
            "1: \n\t"
            "movq (%0, %%"REG_a"), %%mm0 \n\t"
            "movq (%1, %%"REG_a"), %%mm1 \n\t"
            "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
            "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
            "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
            "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpckhbw %%mm3, %%mm7 \n\t"
            "punpcklbw %%mm2, %%mm4 \n\t"
            "punpckhbw %%mm2, %%mm6 \n\t"
#if 1
            MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#else
            "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
            "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
            "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#endif
            "add $8, %%"REG_a" \n\t"
            " js 1b \n\t"
            :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%"REG_a
        );
#else
        const x86_reg mmxSize=1;
#endif
        /* left edge of the two output lines, then the scalar remainder */
        dst[0 ]= (3*src[0] + src[srcStride])>>2;
        dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
            dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
        }
        dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
        dst+=dstStride*2;
        src+=srcStride;
    }
    // last line
#if 1
    dst[0]= src[0];
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    for (x=0; x<srcWidth; x++) {
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif
#if HAVE_MMX
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
#endif
}
  1843. /**
  1844. * Height should be a multiple of 2 and width should be a multiple of 16.
  1845. * (If this is a problem for anyone then tell me, and I will fix it.)
  1846. * Chrominance data is only taken from every second line, others are ignored.
  1847. * FIXME: Write HQ version.
  1848. */
  1849. static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1850. long width, long height,
  1851. long lumStride, long chromStride, long srcStride)
  1852. {
  1853. long y;
  1854. const x86_reg chromWidth= width>>1;
  1855. for (y=0; y<height; y+=2) {
  1856. #if HAVE_MMX
  1857. __asm__ volatile(
  1858. "xor %%"REG_a", %%"REG_a" \n\t"
  1859. "pcmpeqw %%mm7, %%mm7 \n\t"
  1860. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1861. ASMALIGN(4)
  1862. "1: \n\t"
  1863. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1864. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
  1865. "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
  1866. "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
  1867. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
  1868. "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
  1869. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
  1870. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1871. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1872. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1873. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1874. MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
  1875. "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
  1876. "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
  1877. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
  1878. "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
  1879. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
  1880. "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
  1881. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1882. "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1883. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1884. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1885. MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
  1886. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1887. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1888. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1889. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1890. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1891. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1892. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1893. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1894. MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
  1895. MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
  1896. "add $8, %%"REG_a" \n\t"
  1897. "cmp %4, %%"REG_a" \n\t"
  1898. " jb 1b \n\t"
  1899. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1900. : "memory", "%"REG_a
  1901. );
  1902. ydst += lumStride;
  1903. src += srcStride;
  1904. __asm__ volatile(
  1905. "xor %%"REG_a", %%"REG_a" \n\t"
  1906. ASMALIGN(4)
  1907. "1: \n\t"
  1908. PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
  1909. "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
  1910. "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
  1911. "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
  1912. "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
  1913. "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1914. "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1915. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1916. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1917. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1918. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1919. MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
  1920. MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
  1921. "add $8, %%"REG_a" \n\t"
  1922. "cmp %4, %%"REG_a" \n\t"
  1923. " jb 1b \n\t"
  1924. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
  1925. : "memory", "%"REG_a
  1926. );
  1927. #else
  1928. long i;
  1929. for (i=0; i<chromWidth; i++) {
  1930. udst[i] = src[4*i+0];
  1931. ydst[2*i+0] = src[4*i+1];
  1932. vdst[i] = src[4*i+2];
  1933. ydst[2*i+1] = src[4*i+3];
  1934. }
  1935. ydst += lumStride;
  1936. src += srcStride;
  1937. for (i=0; i<chromWidth; i++) {
  1938. ydst[2*i+0] = src[4*i+1];
  1939. ydst[2*i+1] = src[4*i+3];
  1940. }
  1941. #endif
  1942. udst += chromStride;
  1943. vdst += chromStride;
  1944. ydst += lumStride;
  1945. src += srcStride;
  1946. }
  1947. #if HAVE_MMX
  1948. __asm__ volatile(EMMS" \n\t"
  1949. SFENCE" \n\t"
  1950. :::"memory");
  1951. #endif
  1952. }
/**
 * Convert packed 24-bit RGB input to planar YV12 (Y + subsampled U/V).
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line,
 * others are ignored in the C version.
 * FIXME: Write HQ version.
 *
 * NOTE(review): despite the "rgb24" name, both the MMX path (ff_bgr2* tables)
 * and the C fallback (b = src[0], r = src[2]) treat the input as B,G,R byte
 * order — confirm against the callers' pixel format.
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                       long width, long height,
                                       long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;
#if HAVE_MMX
    /* MMX path: process two source rows per iteration; luma for both rows,
     * chroma once (averaged vertically over the row pair). The last row
     * pair falls through to the C loop below (loop runs to height-2). */
    for (y=0; y<height-2; y+=2) {
        long i;
        /* Luma pass, run once per row of the pair.
         * REG_a counts bytes of Y output from -width up to 0; REG_d = 3*REG_a
         * indexes the 3-byte-per-pixel source. Eight pixels per iteration:
         * each pmaddwd with ff_bgr2YCoeff + horizontal add via ff_w1111
         * yields one Y per pixel; ff_bgr2YOffset adds the +16 luma offset. */
        for (i=0; i<2; i++) {
            __asm__ volatile(
                "mov                        %2, %%"REG_a"   \n\t"
                "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
                "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
                "pxor                    %%mm7, %%mm7       \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
                ASMALIGN(4)
                "1:                                         \n\t"
                PREFETCH" 64(%0, %%"REG_d")                 \n\t"
                "movd       (%0, %%"REG_d"), %%mm0          \n\t"
                "movd      3(%0, %%"REG_d"), %%mm1          \n\t"
                "punpcklbw %%mm7, %%mm0                     \n\t"
                "punpcklbw %%mm7, %%mm1                     \n\t"
                "movd      6(%0, %%"REG_d"), %%mm2          \n\t"
                "movd      9(%0, %%"REG_d"), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm2                     \n\t"
                "punpcklbw %%mm7, %%mm3                     \n\t"
                "pmaddwd   %%mm6, %%mm0                     \n\t"
                "pmaddwd   %%mm6, %%mm1                     \n\t"
                "pmaddwd   %%mm6, %%mm2                     \n\t"
                "pmaddwd   %%mm6, %%mm3                     \n\t"
#ifndef FAST_BGR2YV12
                /* full-precision variant: renormalize the dot products */
                "psrad        $8, %%mm0                     \n\t"
                "psrad        $8, %%mm1                     \n\t"
                "psrad        $8, %%mm2                     \n\t"
                "psrad        $8, %%mm3                     \n\t"
#endif
                "packssdw  %%mm1, %%mm0                     \n\t"
                "packssdw  %%mm3, %%mm2                     \n\t"
                "pmaddwd   %%mm5, %%mm0                     \n\t"
                "pmaddwd   %%mm5, %%mm2                     \n\t"
                "packssdw  %%mm2, %%mm0                     \n\t"
                "psraw        $7, %%mm0                     \n\t"
                /* second group of four pixels */
                "movd     12(%0, %%"REG_d"), %%mm4          \n\t"
                "movd     15(%0, %%"REG_d"), %%mm1          \n\t"
                "punpcklbw %%mm7, %%mm4                     \n\t"
                "punpcklbw %%mm7, %%mm1                     \n\t"
                "movd     18(%0, %%"REG_d"), %%mm2          \n\t"
                "movd     21(%0, %%"REG_d"), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm2                     \n\t"
                "punpcklbw %%mm7, %%mm3                     \n\t"
                "pmaddwd   %%mm6, %%mm4                     \n\t"
                "pmaddwd   %%mm6, %%mm1                     \n\t"
                "pmaddwd   %%mm6, %%mm2                     \n\t"
                "pmaddwd   %%mm6, %%mm3                     \n\t"
#ifndef FAST_BGR2YV12
                "psrad        $8, %%mm4                     \n\t"
                "psrad        $8, %%mm1                     \n\t"
                "psrad        $8, %%mm2                     \n\t"
                "psrad        $8, %%mm3                     \n\t"
#endif
                "packssdw  %%mm1, %%mm4                     \n\t"
                "packssdw  %%mm3, %%mm2                     \n\t"
                "pmaddwd   %%mm5, %%mm4                     \n\t"
                "pmaddwd   %%mm5, %%mm2                     \n\t"
                "add         $24, %%"REG_d"                 \n\t"
                "packssdw  %%mm2, %%mm4                     \n\t"
                "psraw        $7, %%mm4                     \n\t"
                "packuswb  %%mm4, %%mm0                     \n\t"
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
                MOVNTQ"    %%mm0, (%1, %%"REG_a")           \n\t"
                "add          $8, %%"REG_a"                 \n\t"
                " js          1b                            \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
                : "%"REG_a, "%"REG_d
            );
            ydst += lumStride;
            src  += srcStride;
        }
        /* rewind so the chroma pass sees both rows of the pair again */
        src -= srcStride*2;
        /* Chroma pass: %0/%1 are the two source rows. Pixels are averaged
         * horizontally (pairs) and vertically (the two rows), then U and V
         * are produced per 2x2 block via ff_bgr2UCoeff/ff_bgr2VCoeff and
         * offset by ff_bgr2UVOffset (+128). */
        __asm__ volatile(
            "mov                       %4, %%"REG_a"    \n\t"
            "movq      "MANGLE(ff_w1111)", %%mm5        \n\t"
            "movq "MANGLE(ff_bgr2UCoeff)", %%mm6        \n\t"
            "pxor                   %%mm7, %%mm7        \n\t"
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
            "add            %%"REG_d", %%"REG_d"        \n\t"
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH" 64(%0, %%"REG_d")                 \n\t"
            PREFETCH" 64(%1, %%"REG_d")                 \n\t"
#if HAVE_MMX2 || HAVE_AMD3DNOW
            /* fast 2x2 averaging via packed-byte averages */
            "movq       (%0, %%"REG_d"), %%mm0          \n\t"
            "movq       (%1, %%"REG_d"), %%mm1          \n\t"
            "movq      6(%0, %%"REG_d"), %%mm2          \n\t"
            "movq      6(%1, %%"REG_d"), %%mm3          \n\t"
            PAVGB"     %%mm1, %%mm0                     \n\t"
            PAVGB"     %%mm3, %%mm2                     \n\t"
            "movq      %%mm0, %%mm1                     \n\t"
            "movq      %%mm2, %%mm3                     \n\t"
            "psrlq       $24, %%mm0                     \n\t"
            "psrlq       $24, %%mm2                     \n\t"
            PAVGB"     %%mm1, %%mm0                     \n\t"
            PAVGB"     %%mm3, %%mm2                     \n\t"
            "punpcklbw %%mm7, %%mm0                     \n\t"
            "punpcklbw %%mm7, %%mm2                     \n\t"
#else
            /* generic MMX: widen to words, sum 4 samples, >>2 */
            "movd       (%0, %%"REG_d"), %%mm0          \n\t"
            "movd       (%1, %%"REG_d"), %%mm1          \n\t"
            "movd      3(%0, %%"REG_d"), %%mm2          \n\t"
            "movd      3(%1, %%"REG_d"), %%mm3          \n\t"
            "punpcklbw %%mm7, %%mm0                     \n\t"
            "punpcklbw %%mm7, %%mm1                     \n\t"
            "punpcklbw %%mm7, %%mm2                     \n\t"
            "punpcklbw %%mm7, %%mm3                     \n\t"
            "paddw     %%mm1, %%mm0                     \n\t"
            "paddw     %%mm3, %%mm2                     \n\t"
            "paddw     %%mm2, %%mm0                     \n\t"
            "movd      6(%0, %%"REG_d"), %%mm4          \n\t"
            "movd      6(%1, %%"REG_d"), %%mm1          \n\t"
            "movd      9(%0, %%"REG_d"), %%mm2          \n\t"
            "movd      9(%1, %%"REG_d"), %%mm3          \n\t"
            "punpcklbw %%mm7, %%mm4                     \n\t"
            "punpcklbw %%mm7, %%mm1                     \n\t"
            "punpcklbw %%mm7, %%mm2                     \n\t"
            "punpcklbw %%mm7, %%mm3                     \n\t"
            "paddw     %%mm1, %%mm4                     \n\t"
            "paddw     %%mm3, %%mm2                     \n\t"
            "paddw     %%mm4, %%mm2                     \n\t"
            "psrlw        $2, %%mm0                     \n\t"
            "psrlw        $2, %%mm2                     \n\t"
#endif
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm1        \n\t"
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm3        \n\t"
            "pmaddwd   %%mm0, %%mm1                     \n\t"
            "pmaddwd   %%mm2, %%mm3                     \n\t"
            "pmaddwd   %%mm6, %%mm0                     \n\t"
            "pmaddwd   %%mm6, %%mm2                     \n\t"
#ifndef FAST_BGR2YV12
            "psrad        $8, %%mm0                     \n\t"
            "psrad        $8, %%mm1                     \n\t"
            "psrad        $8, %%mm2                     \n\t"
            "psrad        $8, %%mm3                     \n\t"
#endif
            "packssdw  %%mm2, %%mm0                     \n\t"
            "packssdw  %%mm3, %%mm1                     \n\t"
            "pmaddwd   %%mm5, %%mm0                     \n\t"
            "pmaddwd   %%mm5, %%mm1                     \n\t"
            "packssdw  %%mm1, %%mm0                     \n\t" // V1 V0 U1 U0
            "psraw        $7, %%mm0                     \n\t"
#if HAVE_MMX2 || HAVE_AMD3DNOW
            "movq     12(%0, %%"REG_d"), %%mm4          \n\t"
            "movq     12(%1, %%"REG_d"), %%mm1          \n\t"
            "movq     18(%0, %%"REG_d"), %%mm2          \n\t"
            "movq     18(%1, %%"REG_d"), %%mm3          \n\t"
            PAVGB"     %%mm1, %%mm4                     \n\t"
            PAVGB"     %%mm3, %%mm2                     \n\t"
            "movq      %%mm4, %%mm1                     \n\t"
            "movq      %%mm2, %%mm3                     \n\t"
            "psrlq       $24, %%mm4                     \n\t"
            "psrlq       $24, %%mm2                     \n\t"
            PAVGB"     %%mm1, %%mm4                     \n\t"
            PAVGB"     %%mm3, %%mm2                     \n\t"
            "punpcklbw %%mm7, %%mm4                     \n\t"
            "punpcklbw %%mm7, %%mm2                     \n\t"
#else
            "movd     12(%0, %%"REG_d"), %%mm4          \n\t"
            "movd     12(%1, %%"REG_d"), %%mm1          \n\t"
            "movd     15(%0, %%"REG_d"), %%mm2          \n\t"
            "movd     15(%1, %%"REG_d"), %%mm3          \n\t"
            "punpcklbw %%mm7, %%mm4                     \n\t"
            "punpcklbw %%mm7, %%mm1                     \n\t"
            "punpcklbw %%mm7, %%mm2                     \n\t"
            "punpcklbw %%mm7, %%mm3                     \n\t"
            "paddw     %%mm1, %%mm4                     \n\t"
            "paddw     %%mm3, %%mm2                     \n\t"
            "paddw     %%mm2, %%mm4                     \n\t"
            "movd     18(%0, %%"REG_d"), %%mm5          \n\t"
            "movd     18(%1, %%"REG_d"), %%mm1          \n\t"
            "movd     21(%0, %%"REG_d"), %%mm2          \n\t"
            "movd     21(%1, %%"REG_d"), %%mm3          \n\t"
            "punpcklbw %%mm7, %%mm5                     \n\t"
            "punpcklbw %%mm7, %%mm1                     \n\t"
            "punpcklbw %%mm7, %%mm2                     \n\t"
            "punpcklbw %%mm7, %%mm3                     \n\t"
            "paddw     %%mm1, %%mm5                     \n\t"
            "paddw     %%mm3, %%mm2                     \n\t"
            "paddw     %%mm5, %%mm2                     \n\t"
            /* mm5 was used as a temporary above; restore the ones vector */
            "movq "MANGLE(ff_w1111)", %%mm5             \n\t"
            "psrlw        $2, %%mm4                     \n\t"
            "psrlw        $2, %%mm2                     \n\t"
#endif
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm1        \n\t"
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm3        \n\t"
            "pmaddwd   %%mm4, %%mm1                     \n\t"
            "pmaddwd   %%mm2, %%mm3                     \n\t"
            "pmaddwd   %%mm6, %%mm4                     \n\t"
            "pmaddwd   %%mm6, %%mm2                     \n\t"
#ifndef FAST_BGR2YV12
            "psrad        $8, %%mm4                     \n\t"
            "psrad        $8, %%mm1                     \n\t"
            "psrad        $8, %%mm2                     \n\t"
            "psrad        $8, %%mm3                     \n\t"
#endif
            "packssdw  %%mm2, %%mm4                     \n\t"
            "packssdw  %%mm3, %%mm1                     \n\t"
            "pmaddwd   %%mm5, %%mm4                     \n\t"
            "pmaddwd   %%mm5, %%mm1                     \n\t"
            "add         $24, %%"REG_d"                 \n\t"
            "packssdw  %%mm1, %%mm4                     \n\t" // V3 V2 U3 U2
            "psraw        $7, %%mm4                     \n\t"
            /* interleave the two U/V quads, split into U and V planes */
            "movq      %%mm0, %%mm1                     \n\t"
            "punpckldq %%mm4, %%mm0                     \n\t"
            "punpckhdq %%mm4, %%mm1                     \n\t"
            "packsswb  %%mm1, %%mm0                     \n\t"
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0     \n\t"
            "movd      %%mm0, (%2, %%"REG_a")           \n\t"
            "punpckhdq %%mm0, %%mm0                     \n\t"
            "movd      %%mm0, (%3, %%"REG_a")           \n\t"
            "add          $4, %%"REG_a"                 \n\t"
            " js          1b                            \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
            : "%"REG_a, "%"REG_d
        );
        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#else
    y=0;
#endif
    /* C fallback / tail: on the MMX path this handles the remaining rows
     * (y starts at height-2); otherwise it processes the whole image.
     * Chroma is taken from the even line only (no vertical averaging). */
    for (; y<height; y+=2) {
        long i;
        for (i=0; i<chromWidth; i++) {
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];
            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
            unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
            udst[i]     = U;
            vdst[i]     = V;
            ydst[2*i]   = Y;
            b = src[6*i+3];
            g = src[6*i+4];
            r = src[6*i+5];
            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        ydst += lumStride;
        src  += srcStride;
        /* second line of the pair: luma only */
        for (i=0; i<chromWidth; i++) {
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];
            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i]   = Y;
            b = src[6*i+3];
            g = src[6*i+4];
            r = src[6*i+5];
            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
}
/**
 * Interleave two byte planes: dest[2w] = src1[w], dest[2w+1] = src2[w],
 * row by row, with per-plane strides.
 * SIMD paths handle width in 16-byte groups; the scalar tail finishes the
 * remaining width&15 bytes of each row.
 */
static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                                    long width, long height, long src1Stride,
                                    long src2Stride, long dstStride)
{
    long h;
    for (h=0; h < height; h++) {
        long w;
#if HAVE_MMX
#if HAVE_SSE2
        /* NOTE(review): movdqa requires 16-byte-aligned src1; callers and
         * strides must guarantee alignment — confirm at the call sites.
         * The %3 bound (width-15) assumes width >= 16; smaller widths would
         * wrap the unsigned jb comparison — TODO confirm callers. */
        __asm__(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%1, %%"REG_a")             \n\t"
            PREFETCH" 64(%2, %%"REG_a")             \n\t"
            "movdqa  (%1, %%"REG_a"), %%xmm0        \n\t"
            "movdqa  (%1, %%"REG_a"), %%xmm1        \n\t"
            "movdqa  (%2, %%"REG_a"), %%xmm2        \n\t"
            "punpcklbw        %%xmm2, %%xmm0        \n\t"
            "punpckhbw        %%xmm2, %%xmm1        \n\t"
            "movntdq %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
            "movntdq %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
            "add $16, %%"REG_a"                     \n\t"
            "cmp %3, %%"REG_a"                      \n\t"
            " jb 1b                                 \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a""
        );
#else
        /* MMX: 16 bytes of each source per iteration, interleaved with
         * punpcklbw/punpckhbw and stored non-temporally. */
        __asm__(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%1, %%"REG_a")             \n\t"
            PREFETCH" 64(%2, %%"REG_a")             \n\t"
            "movq   (%1, %%"REG_a"), %%mm0          \n\t"
            "movq  8(%1, %%"REG_a"), %%mm2          \n\t"
            "movq             %%mm0, %%mm1          \n\t"
            "movq             %%mm2, %%mm3          \n\t"
            "movq   (%2, %%"REG_a"), %%mm4          \n\t"
            "movq  8(%2, %%"REG_a"), %%mm5          \n\t"
            "punpcklbw        %%mm4, %%mm0          \n\t"
            "punpckhbw        %%mm4, %%mm1          \n\t"
            "punpcklbw        %%mm5, %%mm2          \n\t"
            "punpckhbw        %%mm5, %%mm3          \n\t"
            MOVNTQ" %%mm0,   (%0, %%"REG_a", 2)     \n\t"
            MOVNTQ" %%mm1,  8(%0, %%"REG_a", 2)     \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)     \n\t"
            MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)     \n\t"
            "add $16, %%"REG_a"                     \n\t"
            "cmp %3, %%"REG_a"                      \n\t"
            " jb 1b                                 \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
        );
#endif
        /* scalar tail for the last width&15 bytes of the row */
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        for (w=0; w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
#if HAVE_MMX
    /* leave MMX state and flush non-temporal stores */
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
/**
 * Upsample two chroma planes by 2x in both directions:
 * horizontally by duplicating each byte (punpcklbw/punpckhbw with itself),
 * vertically by reusing the same source row for two output rows (y>>1).
 * Output row y of dst1/dst2 is source row y/2 with every byte doubled.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    x86_reg y;
    long x,w,h;
    w=width/2; h=height/2;
#if HAVE_MMX
    /* warm the cache for the second source rows */
    __asm__ volatile(
        PREFETCH" %0    \n\t"
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    /* first plane */
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
#if HAVE_MMX
        /* 32 input bytes -> 64 output bytes per iteration */
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32%1         \n\t"
                "movq         %1, %%mm0 \n\t"
                "movq        8%1, %%mm2 \n\t"
                "movq       16%1, %%mm4 \n\t"
                "movq       24%1, %%mm6 \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                /* unpack each register with itself: AABBCCDD... */
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0,   %0     \n\t"
                MOVNTQ" %%mm1,  8%0     \n\t"
                MOVNTQ" %%mm2, 16%0     \n\t"
                MOVNTQ" %%mm3, 24%0     \n\t"
                MOVNTQ" %%mm4, 32%0     \n\t"
                MOVNTQ" %%mm5, 40%0     \n\t"
                MOVNTQ" %%mm6, 48%0     \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        /* scalar tail */
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    /* second plane: identical processing */
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#if HAVE_MMX
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32%1         \n\t"
                "movq         %1, %%mm0 \n\t"
                "movq        8%1, %%mm2 \n\t"
                "movq       16%1, %%mm4 \n\t"
                "movq       24%1, %%mm6 \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0,   %0     \n\t"
                MOVNTQ" %%mm1,  8%0     \n\t"
                MOVNTQ" %%mm2, 16%0     \n\t"
                MOVNTQ" %%mm3, 24%0     \n\t"
                MOVNTQ" %%mm4, 32%0     \n\t"
                MOVNTQ" %%mm5, 40%0     \n\t"
                MOVNTQ" %%mm6, 48%0     \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
/**
 * Pack planar YVU9 (chroma subsampled 4x vertically: up/vp advance every
 * 4th line via y>>2) into interleaved YUY2: Y U Y V per pixel pair, with
 * each chroma sample reused for 4 consecutive luma samples horizontally.
 * src1 = Y plane, src2 = U plane, src3 = V plane.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    x86_reg x;
    long y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#if HAVE_MMX
        /* 32 luma + 8 U + 8 V bytes -> 64 output bytes per iteration */
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH" 32(%1, %0)            \n\t"
                PREFETCH" 32(%2, %0)            \n\t"
                PREFETCH" 32(%3, %0)            \n\t"
                "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */
                "movq            %%mm1, %%mm6   \n\t"
                "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ"          %%mm0, (%4, %0, 8)     \n\t"
                MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"
                "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq     8(%1, %0, 4), %%mm0   \n\t"
                "movq            %%mm0, %%mm3   \n\t"
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"
                "movq            %%mm4, %%mm6   \n\t"
                "movq    16(%1, %0, 4), %%mm0   \n\t"
                "movq            %%mm0, %%mm3   \n\t"
                "punpcklbw       %%mm5, %%mm4   \n\t"
                "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"
                "punpckhbw       %%mm5, %%mm6   \n\t"
                "movq    24(%1, %0, 4), %%mm0   \n\t"
                "movq            %%mm0, %%mm3   \n\t"
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"
                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
        /* scalar tail: 4 luma samples share one U and one V */
        for (; x<w; x++) {
            const long x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
/**
 * Copy every second byte of src into dst: dst[i] = src[2*i], i in [0,count).
 * Pointers are advanced to the end and a negative index counts up to zero,
 * so the MMX loop ends with a simple `js` on the index register.
 */
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst += count;
    src += 2*count;
    count= - count;
#if HAVE_MMX
    if(count <= -16) {
        /* bias so the asm handles count in 16-byte groups; the leftover
         * (up to 15 bytes) falls through to the scalar loop */
        count += 15;
        __asm__ volatile(
            /* mm7 = 0x00FF... word mask to keep the even (low) bytes */
            "pcmpeqw       %%mm7, %%mm7 \n\t"
            "psrlw            $8, %%mm7 \n\t"
            "1:                         \n\t"
            "movq -30(%1, %0, 2), %%mm0 \n\t"
            "movq -22(%1, %0, 2), %%mm1 \n\t"
            "movq -14(%1, %0, 2), %%mm2 \n\t"
            "movq  -6(%1, %0, 2), %%mm3 \n\t"
            "pand          %%mm7, %%mm0 \n\t"
            "pand          %%mm7, %%mm1 \n\t"
            "pand          %%mm7, %%mm2 \n\t"
            "pand          %%mm7, %%mm3 \n\t"
            "packuswb      %%mm1, %%mm0 \n\t"
            "packuswb      %%mm3, %%mm2 \n\t"
            MOVNTQ"        %%mm0,-15(%2, %0) \n\t"
            MOVNTQ"        %%mm2,- 7(%2, %0) \n\t"
            "add             $16, %0    \n\t"
            " js              1b        \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        /* undo the bias for the scalar tail */
        count -= 15;
    }
#endif
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
/**
 * De-interleave the even bytes of 4-byte groups into two planes:
 * dst0[i] = src[4*i+0], dst1[i] = src[4*i+2] (e.g. U/V from UYVY).
 * Uses the end-pointer + negative-count idiom; MMX path does 8 outputs
 * per plane per iteration, scalar loop handles the remainder.
 */
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8) {
        count += 7;  /* bias: asm runs in 8-sample groups, tail goes scalar */
        __asm__ volatile(
            /* mm7 = 0x00FF... word mask */
            "pcmpeqw      %%mm7, %%mm7 \n\t"
            "psrlw           $8, %%mm7 \n\t"
            "1:                        \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq  -4(%1, %0, 4), %%mm3 \n\t"
            /* keep bytes 0 and 2 of each group */
            "pand         %%mm7, %%mm0 \n\t"
            "pand         %%mm7, %%mm1 \n\t"
            "pand         %%mm7, %%mm2 \n\t"
            "pand         %%mm7, %%mm3 \n\t"
            "packuswb     %%mm1, %%mm0 \n\t"
            "packuswb     %%mm3, %%mm2 \n\t"
            /* split into byte-0 stream (dst0) and byte-2 stream (dst1) */
            "movq         %%mm0, %%mm1 \n\t"
            "movq         %%mm2, %%mm3 \n\t"
            "psrlw           $8, %%mm0 \n\t"
            "psrlw           $8, %%mm2 \n\t"
            "pand         %%mm7, %%mm1 \n\t"
            "pand         %%mm7, %%mm3 \n\t"
            "packuswb     %%mm2, %%mm0 \n\t"
            "packuswb     %%mm3, %%mm1 \n\t"
            MOVNTQ"       %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ"       %%mm1,- 7(%2, %0) \n\t"
            "add             $8, %0    \n\t"
            " js             1b        \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
/**
 * Like extract_even2 but averaging two source rows first:
 * dst0[i] = avg(src0[4i+0], src1[4i+0]), dst1[i] = avg(src0[4i+2], src1[4i+2]).
 * The asm path uses PAVGB (rounds up); the scalar fallback uses (a+b)>>1
 * (rounds down), so the two paths may differ by 1 LSB on the tail.
 * Compiled only when PAVGB is available (MMX2/3DNow).
 */
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;  /* bias for 8-sample asm groups */
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7 \n\t"
            "psrlw             $8, %%mm7 \n\t"
            "1:                          \n\t"
            "movq  -28(%1, %0, 4), %%mm0 \n\t"
            "movq  -20(%1, %0, 4), %%mm1 \n\t"
            "movq  -12(%1, %0, 4), %%mm2 \n\t"
            "movq   -4(%1, %0, 4), %%mm3 \n\t"
            /* vertical average with the second row */
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "pand           %%mm7, %%mm0 \n\t"
            "pand           %%mm7, %%mm1 \n\t"
            "pand           %%mm7, %%mm2 \n\t"
            "pand           %%mm7, %%mm3 \n\t"
            "packuswb       %%mm1, %%mm0 \n\t"
            "packuswb       %%mm3, %%mm2 \n\t"
            "movq           %%mm0, %%mm1 \n\t"
            "movq           %%mm2, %%mm3 \n\t"
            "psrlw             $8, %%mm0 \n\t"
            "psrlw             $8, %%mm2 \n\t"
            "pand           %%mm7, %%mm1 \n\t"
            "pand           %%mm7, %%mm3 \n\t"
            "packuswb       %%mm2, %%mm0 \n\t"
            "packuswb       %%mm3, %%mm1 \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ"         %%mm1,- 7(%3, %0) \n\t"
            "add               $8, %0    \n\t"
            " js               1b        \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
/**
 * De-interleave the odd bytes of 4-byte groups into two planes:
 * dst0[i] = src[4*i+1], dst1[i] = src[4*i+3] (e.g. U/V from YUYV).
 * The asm selects odd bytes with psrlw $8; the scalar fallback gets the
 * same effect from the src++ after the #endif.
 */
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8) {
        count += 7;  /* bias for 8-sample asm groups */
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7 \n\t"
            "psrlw            $8, %%mm7 \n\t"
            "1:                         \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq  -4(%1, %0, 4), %%mm3 \n\t"
            /* keep bytes 1 and 3 of each group */
            "psrlw            $8, %%mm0 \n\t"
            "psrlw            $8, %%mm1 \n\t"
            "psrlw            $8, %%mm2 \n\t"
            "psrlw            $8, %%mm3 \n\t"
            "packuswb      %%mm1, %%mm0 \n\t"
            "packuswb      %%mm3, %%mm2 \n\t"
            /* split into byte-1 stream (dst0) and byte-3 stream (dst1) */
            "movq          %%mm0, %%mm1 \n\t"
            "movq          %%mm2, %%mm3 \n\t"
            "psrlw            $8, %%mm0 \n\t"
            "psrlw            $8, %%mm2 \n\t"
            "pand          %%mm7, %%mm1 \n\t"
            "pand          %%mm7, %%mm3 \n\t"
            "packuswb      %%mm2, %%mm0 \n\t"
            "packuswb      %%mm3, %%mm1 \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0) \n\t"
            "add              $8, %0    \n\t"
            " js              1b        \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* shift to odd offsets so the even-byte loop below reads 4i+1 / 4i+3 */
    src++;
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
/**
 * Like extract_odd2 but averaging two source rows first:
 * dst0[i] = avg(src0[4i+1], src1[4i+1]), dst1[i] = avg(src0[4i+3], src1[4i+3]).
 * Asm path rounds up (PAVGB), scalar tail rounds down ((a+b)>>1) — may
 * differ by 1 LSB. Compiled only when PAVGB is available.
 */
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;  /* bias for 8-sample asm groups */
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7 \n\t"
            "psrlw             $8, %%mm7 \n\t"
            "1:                          \n\t"
            "movq  -28(%1, %0, 4), %%mm0 \n\t"
            "movq  -20(%1, %0, 4), %%mm1 \n\t"
            "movq  -12(%1, %0, 4), %%mm2 \n\t"
            "movq   -4(%1, %0, 4), %%mm3 \n\t"
            /* vertical average with the second row */
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            /* keep bytes 1 and 3 of each group */
            "psrlw             $8, %%mm0 \n\t"
            "psrlw             $8, %%mm1 \n\t"
            "psrlw             $8, %%mm2 \n\t"
            "psrlw             $8, %%mm3 \n\t"
            "packuswb       %%mm1, %%mm0 \n\t"
            "packuswb       %%mm3, %%mm2 \n\t"
            "movq           %%mm0, %%mm1 \n\t"
            "movq           %%mm2, %%mm3 \n\t"
            "psrlw             $8, %%mm0 \n\t"
            "psrlw             $8, %%mm2 \n\t"
            "pand           %%mm7, %%mm1 \n\t"
            "pand           %%mm7, %%mm3 \n\t"
            "packuswb       %%mm2, %%mm0 \n\t"
            "packuswb       %%mm3, %%mm1 \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ"         %%mm1,- 7(%3, %0) \n\t"
            "add               $8, %0    \n\t"
            " js               1b        \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* shift to odd offsets for the scalar tail */
    src0++;
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
  2731. static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2732. long width, long height,
  2733. long lumStride, long chromStride, long srcStride)
  2734. {
  2735. long y;
  2736. const long chromWidth= -((-width)>>1);
  2737. for (y=0; y<height; y++) {
  2738. RENAME(extract_even)(src, ydst, width);
  2739. if(y&1) {
  2740. RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
  2741. udst+= chromStride;
  2742. vdst+= chromStride;
  2743. }
  2744. src += srcStride;
  2745. ydst+= lumStride;
  2746. }
  2747. #if HAVE_MMX
  2748. __asm__(
  2749. EMMS" \n\t"
  2750. SFENCE" \n\t"
  2751. ::: "memory"
  2752. );
  2753. #endif
  2754. }
  2755. static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2756. long width, long height,
  2757. long lumStride, long chromStride, long srcStride)
  2758. {
  2759. long y;
  2760. const long chromWidth= -((-width)>>1);
  2761. for (y=0; y<height; y++) {
  2762. RENAME(extract_even)(src, ydst, width);
  2763. RENAME(extract_odd2)(src, udst, vdst, chromWidth);
  2764. src += srcStride;
  2765. ydst+= lumStride;
  2766. udst+= chromStride;
  2767. vdst+= chromStride;
  2768. }
  2769. #if HAVE_MMX
  2770. __asm__(
  2771. EMMS" \n\t"
  2772. SFENCE" \n\t"
  2773. ::: "memory"
  2774. );
  2775. #endif
  2776. }
  2777. static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2778. long width, long height,
  2779. long lumStride, long chromStride, long srcStride)
  2780. {
  2781. long y;
  2782. const long chromWidth= -((-width)>>1);
  2783. for (y=0; y<height; y++) {
  2784. RENAME(extract_even)(src+1, ydst, width);
  2785. if(y&1) {
  2786. RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
  2787. udst+= chromStride;
  2788. vdst+= chromStride;
  2789. }
  2790. src += srcStride;
  2791. ydst+= lumStride;
  2792. }
  2793. #if HAVE_MMX
  2794. __asm__(
  2795. EMMS" \n\t"
  2796. SFENCE" \n\t"
  2797. ::: "memory"
  2798. );
  2799. #endif
  2800. }
  2801. static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2802. long width, long height,
  2803. long lumStride, long chromStride, long srcStride)
  2804. {
  2805. long y;
  2806. const long chromWidth= -((-width)>>1);
  2807. for (y=0; y<height; y++) {
  2808. RENAME(extract_even)(src+1, ydst, width);
  2809. RENAME(extract_even2)(src, udst, vdst, chromWidth);
  2810. src += srcStride;
  2811. ydst+= lumStride;
  2812. udst+= chromStride;
  2813. vdst+= chromStride;
  2814. }
  2815. #if HAVE_MMX
  2816. __asm__(
  2817. EMMS" \n\t"
  2818. SFENCE" \n\t"
  2819. ::: "memory"
  2820. );
  2821. #endif
  2822. }
/**
 * Install this template instantiation's optimized implementations into the
 * global rgb2rgb function pointers. Called once per selected CPU variant;
 * the RENAME() macro picks the variant-specific symbol names.
 */
static inline void RENAME(rgb2rgb_init)(void)
{
    rgb15to16       = RENAME(rgb15to16);
    rgb15tobgr24    = RENAME(rgb15tobgr24);
    rgb15to32       = RENAME(rgb15to32);
    rgb16tobgr24    = RENAME(rgb16tobgr24);
    rgb16to32       = RENAME(rgb16to32);
    rgb16to15       = RENAME(rgb16to15);
    rgb24tobgr16    = RENAME(rgb24tobgr16);
    rgb24tobgr15    = RENAME(rgb24tobgr15);
    rgb24tobgr32    = RENAME(rgb24tobgr32);
    rgb32to16       = RENAME(rgb32to16);
    rgb32to15       = RENAME(rgb32to15);
    rgb32tobgr24    = RENAME(rgb32tobgr24);
    rgb24to15       = RENAME(rgb24to15);
    rgb24to16       = RENAME(rgb24to16);
    rgb24tobgr24    = RENAME(rgb24tobgr24);
    rgb32tobgr32    = RENAME(rgb32tobgr32);
    rgb32tobgr16    = RENAME(rgb32tobgr16);
    rgb32tobgr15    = RENAME(rgb32tobgr15);
    yv12toyuy2      = RENAME(yv12toyuy2);
    yv12touyvy      = RENAME(yv12touyvy);
    yuv422ptoyuy2   = RENAME(yuv422ptoyuy2);
    yuv422ptouyvy   = RENAME(yuv422ptouyvy);
    yuy2toyv12      = RENAME(yuy2toyv12);
    // yvu9toyv12   = RENAME(yvu9toyv12);  /* variant not wired up */
    planar2x        = RENAME(planar2x);
    rgb24toyv12     = RENAME(rgb24toyv12);
    interleaveBytes = RENAME(interleaveBytes);
    vu9_to_vu12     = RENAME(vu9_to_vu12);
    yvu9_to_yuy2    = RENAME(yvu9_to_yuy2);
    uyvytoyuv420    = RENAME(uyvytoyuv420);
    uyvytoyuv422    = RENAME(uyvytoyuv422);
    yuyvtoyuv420    = RENAME(yuyvtoyuv420);
    yuyvtoyuv422    = RENAME(yuyvtoyuv422);
}