You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2647 lines
107KB

  1. /*
  2. * software RGB to RGB converter
  3. * pluralize by software PAL8 to RGB converter
  4. * software YUV to YUV converter
  5. * software YUV to RGB converter
  6. * Written by Nick Kurshev.
  7. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  8. * lot of big-endian byte order fixes by Alex Beregszaszi
  9. *
  10. * This file is part of Libav.
  11. *
  12. * Libav is free software; you can redistribute it and/or
  13. * modify it under the terms of the GNU Lesser General Public
  14. * License as published by the Free Software Foundation; either
  15. * version 2.1 of the License, or (at your option) any later version.
  16. *
  17. * Libav is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  20. * Lesser General Public License for more details.
  21. *
  22. * You should have received a copy of the GNU Lesser General Public
  23. * License along with Libav; if not, write to the Free Software
  24. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25. */
#include <stddef.h>
/*
 * Per-template instruction selection.  This file appears to be included
 * several times with different COMPILE_TEMPLATE_* macros set (MMX, MMX2,
 * 3DNow!, SSE2), with RENAME() giving each instantiation distinct symbol
 * names; undef everything first so each inclusion starts clean.
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PAVGB
/* SIMD register width in bytes: 16 for SSE2 (XMM), 8 for MMX. */
#if COMPILE_TEMPLATE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif
/* Prefetch and packed-byte-average mnemonics differ between 3DNow! and MMX2;
 * plain MMX gets a no-op prefetch (and no PAVGB at all). */
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB "pavgusb"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#define PAVGB "pavgb"
#else
#define PREFETCH " # nop"
#endif
#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
/* MMX2 has non-temporal stores (movntq), which require a trailing sfence
 * before anyone else reads the written memory; otherwise use a plain movq
 * and a no-op in place of sfence. */
#if COMPILE_TEMPLATE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
/*
 * Convert 24bpp packed pixels to 32bpp: each 3-byte pixel is copied
 * unchanged and a fourth (alpha) byte is appended.  src_size is in bytes.
 * The MMX loop gathers eight 3-byte pixels into four mm registers
 * (24 source bytes -> 32 destination bytes per iteration) and ORs in
 * mask32a -- defined elsewhere; presumably the constant opaque-alpha
 * bits, matching the 255 written by the scalar tail loop.
 */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;  /* last point where a full 24-byte chunk is readable */
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"           /* two 3-byte pixels per register, */
            "punpckldq 3%1, %%mm0 \n\t"     /* each widened to 4 bytes         */
            "movd 6%1, %%mm1 \n\t"
            "punpckldq 9%1, %%mm1 \n\t"
            "movd 12%1, %%mm2 \n\t"
            "punpckldq 15%1, %%mm2 \n\t"
            "movd 18%1, %%mm3 \n\t"
            "punpckldq 21%1, %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"         /* set the alpha bits */
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm2, 16%0 \n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: copy the three colour bytes, write opaque alpha */
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    }
}
/*
 * Pack eight 32bpp pixels held in mm0/mm1/mm4/mm5 (with copies in
 * mm2/mm3/mm6/mm7) down to 24 contiguous bytes at operand %0, dropping
 * the high byte of each pixel.  The mask24* constants are defined
 * elsewhere.  Clobbers mm0-mm7 and emits three MOVNTQ stores
 * (%0, 8%0, 16%0), i.e. 24 output bytes per expansion.
 */
#define STORE_BGR24_MMX \
"psrlq $8, %%mm2 \n\t" \
"psrlq $8, %%mm3 \n\t" \
"psrlq $8, %%mm6 \n\t" \
"psrlq $8, %%mm7 \n\t" \
"pand "MANGLE(mask24l)", %%mm0\n\t" \
"pand "MANGLE(mask24l)", %%mm1\n\t" \
"pand "MANGLE(mask24l)", %%mm4\n\t" \
"pand "MANGLE(mask24l)", %%mm5\n\t" \
"pand "MANGLE(mask24h)", %%mm2\n\t" \
"pand "MANGLE(mask24h)", %%mm3\n\t" \
"pand "MANGLE(mask24h)", %%mm6\n\t" \
"pand "MANGLE(mask24h)", %%mm7\n\t" \
"por %%mm2, %%mm0 \n\t" \
"por %%mm3, %%mm1 \n\t" \
"por %%mm6, %%mm4 \n\t" \
"por %%mm7, %%mm5 \n\t" \
\
"movq %%mm1, %%mm2 \n\t" \
"movq %%mm4, %%mm3 \n\t" \
"psllq $48, %%mm2 \n\t" \
"psllq $32, %%mm3 \n\t" \
"pand "MANGLE(mask24hh)", %%mm2\n\t" \
"pand "MANGLE(mask24hhh)", %%mm3\n\t" \
"por %%mm2, %%mm0 \n\t" \
"psrlq $16, %%mm1 \n\t" \
"psrlq $32, %%mm4 \n\t" \
"psllq $16, %%mm5 \n\t" \
"por %%mm3, %%mm1 \n\t" \
"pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
"por %%mm5, %%mm4 \n\t" \
\
MOVNTQ" %%mm0, %0 \n\t" \
MOVNTQ" %%mm1, 8%0 \n\t" \
MOVNTQ" %%mm4, 16%0"
/*
 * Convert 32bpp packed pixels to 24bpp by dropping the fourth byte of
 * each pixel.  src_size is in bytes.  The MMX loop loads eight 32bpp
 * pixels (32 bytes) and repacks them to 24 bytes via STORE_BGR24_MMX;
 * the scalar tail copies three bytes and skips the fourth.
 */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;  /* need 32 readable source bytes per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 16%1, %%mm4 \n\t"
            "movq 24%1, %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"   /* STORE_BGR24_MMX wants each value  */
            "movq %%mm1, %%mm3 \n\t"   /* duplicated into a second register */
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            STORE_BGR24_MMX
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;  /* discard the fourth (alpha/padding) byte */
    }
}
/*
  original by Strepto/Astral
  ported to gcc & bugfixed: A'rpi
  MMX2, 3DNOW optimization by Nick Kurshev
  32-bit C version, and the "and+add" trick by Michael Niedermayer
*/
/*
 * Convert RGB1555 (15bpp) to RGB565 (16bpp) in place-compatible fashion.
 * Trick: x + (x & mask-of-top-10-bits) doubles the red+green fields,
 * shifting them up one bit while blue stays put; green's new low bit is 0.
 * mask15s (defined elsewhere) presumably holds that 0x7FE07FE0 pattern,
 * matching the 32-bit C fallback below.  src_size is in bytes.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;  /* 16 bytes (8 pixels) per MMX iteration */
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"   /* isolate r+g fields */
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"  /* add back: shifts r+g left one bit */
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* 32-bit C version of the same and&add trick, two pixels at a time */
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* at most one 16-bit pixel left */
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
/*
 * Convert RGB565 (16bpp) to RGB1555 (15bpp): red and green are shifted
 * down one bit (green loses its least significant bit), blue is kept.
 * mask15rg/mask15b are defined elsewhere; per the C fallback below they
 * presumably hold 0x7FE07FE0 and 0x001F001F.  src_size is in bytes.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;  /* 16 bytes (8 pixels) per MMX iteration */
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"    /* shift r+g into 1555 positions */
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"  /* keep original blue */
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* C fallback, two pixels at a time */
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* at most one 16-bit pixel left */
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
/*
 * Convert 32bpp pixels to RGB565 (16bpp).  src_size is in bytes; the C
 * tail assumes a little-endian 32-bit pixel with the first channel in
 * the low byte.  The enabled MMX path processes 4 pixels per iteration
 * with a pmaddwd trick: mm6 (mask3216br) isolates the two outer
 * channels, mm7 (mul3216) scales both into their 5-bit slots with one
 * pmaddwd, while mm5 (mask3216g) keeps green -- constants defined
 * elsewhere.  The loop is entered via "jmp 2f" so the cmp/jb bound test
 * runs before the first iteration.  The #else variant is an equivalent
 * shift+mask implementation, kept for reference.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;  /* 16 source bytes (4 pixels) per iteration */
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: build 565 word from the three channel bytes */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
/*
 * Convert 32bpp pixels to 16bpp (565) while swapping the outer channels
 * (compare the tail here -- (rgb&0xF8)<<8 -- with rgb32to16's
 * (rgb&0xF80000)>>8).  src_size is in bytes.  The MMX loop handles
 * 4 pixels per iteration: each channel is shifted into its 565 slot and
 * masked (red_16mask/green_16mask/blue_16mask defined elsewhere), then
 * the two pixel pairs are merged and stored 8 bytes at a time.
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;  /* 16 source bytes (4 pixels) per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"    /* low byte up into the red slot */
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"    /* green */
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"   /* high byte down into the blue slot */
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"   /* interleave the two pixel pairs */
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
/*
 * Convert 32bpp pixels to RGB1555 (15bpp).  Identical structure to
 * rgb32to16, but with 5-bit green: the pmaddwd path uses mask3215g /
 * mul3215 and shifts of 6/10 instead of 5/11.  Constants are defined
 * elsewhere.  src_size is in bytes.
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;  /* 16 source bytes (4 pixels) per iteration */
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: build 1555 word from the three channel bytes */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
/*
 * Convert 32bpp pixels to 15bpp (1555) while swapping the outer channels
 * (compare the tail's (rgb&0xF8)<<7 with rgb32to15's (rgb&0xF80000)>>9).
 * Same shift+mask structure as rgb32tobgr16, with 15-bit shift amounts.
 * red_15mask/green_15mask/blue_15mask are defined elsewhere.
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;  /* 16 source bytes (4 pixels) per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"   /* interleave the two pixel pairs */
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
/*
 * Convert 24bpp pixels to 16bpp (565), per the tail loop mapping the
 * first source byte to the low 5 bits of the output word.  The MMX loop
 * loads four 3-byte pixels via movd/punpckldq at byte offsets 0/3/6/9
 * (12 source bytes -> 8 output bytes per iteration) and shift+masks each
 * channel into place; red_16mask/green_16mask/blue_16mask are defined
 * elsewhere.  src_size is in bytes.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;  /* need 12 readable source bytes per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"   /* interleave the two pixel pairs */
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
/*
 * Convert 24bpp pixels to 16bpp (565) with the outer channels swapped
 * relative to rgb24tobgr16: the tail reads r first and the MMX path
 * shifts the low byte up (psllq $8) into the red slot and the high byte
 * down (psrlq $19) into the blue slot.  12 source bytes -> 8 output
 * bytes per MMX iteration; masks defined elsewhere.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"   /* interleave the two pixel pairs */
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
/*
 * Convert 24bpp pixels to 15bpp (1555); same structure as rgb24tobgr16
 * but with 5-bit green (shifts 3/6/9, red_15mask/green_15mask/
 * blue_15mask).  12 source bytes -> 8 output bytes per MMX iteration;
 * src_size is in bytes.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;  /* need 12 readable source bytes per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"   /* interleave the two pixel pairs */
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
/*
 * Convert 24bpp pixels to 15bpp (1555) with the outer channels swapped
 * relative to rgb24tobgr15: the tail reads r first and the MMX path
 * shifts the low byte up (psllq $7) and the high byte down (psrlq $19).
 * 12 source bytes -> 8 output bytes per MMX iteration; masks defined
 * elsewhere.
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"   /* interleave the two pixel pairs */
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
  786. /*
  787. I use less accurate approximation here by simply left-shifting the input
  788. value and filling the low order bits with zeroes. This method improves PNG
  789. compression but this scheme cannot reproduce white exactly, since it does
  790. not generate an all-ones maximum value; the net effect is to darken the
  791. image slightly.
  792. The better method should be "left bit replication":
  793. 4 3 2 1 0
  794. ---------
  795. 1 1 0 1 1
  796. 7 6 5 4 3 2 1 0
  797. ----------------
  798. 1 1 0 1 1 1 1 0
  799. |=======| |===|
  800. | leftmost bits repeated to fill open bits
  801. |
  802. original bits
  803. */
/*
 * Expand RGB1555 (15bpp) to 24bpp by left-shifting each field into the
 * top of its byte (low bits zero -- see the accuracy note above this
 * function).  src_size is in bytes; end is computed in 16-bit pixels.
 * The MMX path unpacks 8 pixels into separate b/g/r word lanes, merges
 * them into 32bpp form across TWO asm statements (mm register contents
 * deliberately carry over from the first statement into the second,
 * which only shuffles registers), then repacks to 24 bytes with
 * STORE_BGR24_MMX.  mask15b/mask15g/mask15r/mmx_null are defined
 * elsewhere.
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;  /* 8 input pixels per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"   /* blue field  */
            "pand %3, %%mm1 \n\t"   /* green field */
            "pand %4, %%mm2 \n\t"   /* red field   */
            "psllq $3, %%mm0 \n\t"  /* scale each 5-bit field to 8 bits */
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"  /* widen words to dwords (zero-fill) */
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"      /* position g and r within the dword */
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm6 \n\t"    /* park pixels 0-3 in mm6/mm7 */
            "movq %%mm3, %%mm7 \n\t"
            "movq 8%1, %%mm0 \n\t"      /* same dance for pixels 4-7 */
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            STORE_BGR24_MMX
            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: shift each 5-bit field to the top of its byte */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
/*
 * Expand RGB565 (16bpp) to 24bpp.  Identical structure to rgb15tobgr24,
 * but with 565 masks (mask16b/mask16g/mask16r, defined elsewhere) and
 * shift amounts psllq $3 / psrlq $3 / psrlq $8 for the 5/6/5 fields.
 * As there, mm registers deliberately carry over between the two asm
 * statements before STORE_BGR24_MMX repacks 8 pixels to 24 bytes.
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;  /* 8 input pixels per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"   /* blue field  */
            "pand %3, %%mm1 \n\t"   /* green field */
            "pand %4, %%mm2 \n\t"   /* red field   */
            "psllq $3, %%mm0 \n\t"  /* scale fields to 8 bits */
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"  /* widen words to dwords (zero-fill) */
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm6 \n\t"    /* park pixels 0-3 in mm6/mm7 */
            "movq %%mm3, %%mm7 \n\t"
            "movq 8%1, %%mm0 \n\t"      /* same dance for pixels 4-7 */
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            STORE_BGR24_MMX
            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: shift each field to the top of its byte */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
/*
 * Register contract for PACK_RGB32 (set up by the rgb15to32/rgb16to32
 * callers below before expanding the macro):
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF   (alpha source, i.e. A = 255)
 * mm7 = 00 00 00 00 00 00 00 00   (zero for packuswb)
 */
/* Packs the 4 expanded pixels into 4 RGB32 dwords and streams 16 bytes to %0. */
#define PACK_RGB32 \
"packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
"packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
"packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
"punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
"punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
"movq %%mm0, %%mm3 \n\t" \
"punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
"punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
MOVNTQ" %%mm0, %0 \n\t" \
MOVNTQ" %%mm3, 8%0 \n\t" \

/**
 * Convert RGB15/BGR555 (2 bytes/pixel, xRRRRRGGGGGBBBBB) to 32-bit
 * pixels (4 bytes/pixel) with the alpha byte forced to 255.
 * src_size is in bytes. 4 pixels per MMX iteration, scalar C tail.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");    /* mm7 = 0, required by PACK_RGB32 */
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); /* mm6 = all ones -> alpha = 0xFF */
    mm_end = end - 3; /* MMX loop needs at least 4 remaining pixels */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"  /* isolate the 5 blue bits */
            "pand %3, %%mm1 \n\t"  /* isolate the 5 green bits */
            "pand %4, %%mm2 \n\t"  /* isolate the 5 red bits */
            "psllq $3, %%mm0 \n\t" /* scale each channel into the top of a byte */
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: remaining 0-3 pixels */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
    }
}
/**
 * Convert RGB16/BGR565 (2 bytes/pixel, RRRRRGGGGGGBBBBB) to 32-bit
 * pixels (4 bytes/pixel) with the alpha byte forced to 255.
 * src_size is in bytes. 4 pixels per MMX iteration, scalar C tail.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");    /* mm7 = 0, required by PACK_RGB32 */
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); /* mm6 = all ones -> alpha = 0xFF */
    mm_end = end - 3; /* MMX loop needs at least 4 remaining pixels */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"  /* isolate the 5 blue bits */
            "pand %3, %%mm1 \n\t"  /* isolate the 6 green bits */
            "pand %4, %%mm2 \n\t"  /* isolate the 5 red bits */
            "psllq $3, %%mm0 \n\t" /* scale each channel into the top of a byte */
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: remaining 0-3 pixels */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
    }
}
/**
 * Byte shuffle 2,1,0,3 within each 32-bit pixel: swaps bytes 0 and 2
 * (B <-> R, e.g. BGRA <-> RGBA); bytes 1 and 3 stay in place.
 * The MMX loop walks a negative index up to 0, 16 bytes per pass;
 * the trailing C loop handles the last (src_size & 15) bytes plus the
 * 15 bytes the negative-index setup intentionally leaves over.
 */
static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, long src_size)
{
    x86_reg idx = 15 - src_size; /* negative while at least 16 bytes remain */
    const uint8_t *s = src-idx;  /* bias pointers so (base, idx) addressing works */
    uint8_t *d = dst-idx;
    __asm__ volatile(
        "test %0, %0 \n\t"
        "jns 2f \n\t"              /* fewer than 16 bytes: skip the MMX loop entirely */
        PREFETCH" (%1, %0) \n\t"
        /* build two complementary byte masks from mask32b/mask32r/mmx_one
         * (constants defined elsewhere in this file):
         * mm6 selects the B+R bytes, mm7 the G+A bytes of each dword */
        "movq %3, %%mm7 \n\t"
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
# if COMPILE_TEMPLATE_MMX2
        /* pshufw $177 swaps the 16-bit words of each dword, moving B and R
         * past each other; masks then recombine with the untouched G/A bytes */
        "pshufw $177, %%mm0, %%mm3 \n\t"
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# else
        /* no pshufw on plain MMX: shift the B/R bytes 16 bits each way
         * and merge them back with the G/A bytes */
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
    /* scalar tail: swap B and R one dword at a time */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
/**
 * Swap R and B in packed 24-bit pixels (RGB24 <-> BGR24; self-inverse).
 * The MMX loop processes 24 bytes (8 pixels) per iteration by masking
 * three source qwords with mask24r/mask24g/mask24b at staggered offsets;
 * a C loop redoes the final partial chunk byte-wise.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
    x86_reg mmx_size= 23 - src_size; /* negative while at least 24 bytes remain */
    __asm__ volatile (
        "test %%"REG_a", %%"REG_a" \n\t"
        "jns 2f \n\t"                           /* fewer than 24 bytes: skip MMX loop */
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a") \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add $24, %%"REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    if (mmx_size==23) return; /* finished: src_size was a multiple of 24 bytes (8 pixels) */
    /* redo the last, partially-covered 24-byte chunk in C */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
    for (i=0; i<src_size; i+=3) {
        register uint8_t x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
/**
 * Interleave planar Y/U/V into packed YUY2 (byte order Y0 U0 Y1 V0 ...).
 * Emits 16 luma pixels (32 output bytes) per inner-loop pass, so width
 * should be a multiple of 16.
 * @param vertLumPerChroma luma lines per chroma line (2 for 4:2:0 input,
 *        1 for 4:2:2); must be a power of two — it is used as a bit mask.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        /* advance chroma pointers only every vertLumPerChroma'th luma line */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}
/**
 * YV12 (planar 4:2:0) to packed YUY2.
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    /* each chroma line is reused for 2 luma lines (vertLumPerChroma = 2) */
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
/**
 * Interleave planar Y/U/V into packed UYVY (byte order U0 Y0 V0 Y1 ...).
 * Same structure as yuvPlanartoyuy2 above, but the punpck operand order is
 * reversed so the chroma bytes end up first. Width should be a multiple
 * of 16.
 * @param vertLumPerChroma luma lines per chroma line (2 for 4:2:0 input,
 *        1 for 4:2:2); must be a power of two — it is used as a bit mask.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
            "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        /* advance chroma pointers only every vertLumPerChroma'th luma line */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}
/**
 * YV12 (planar 4:2:0) to packed UYVY.
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    /* each chroma line is reused for 2 luma lines (vertLumPerChroma = 2) */
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
/**
 * YUV 4:2:2 planar to packed UYVY (one chroma line per luma line).
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
/**
 * YUV 4:2:2 planar to packed YUY2 (one chroma line per luma line).
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
/**
 * Packed YUY2 to planar YV12 (4:2:0 deinterleave).
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chroma is taken only from the even source lines; the odd lines of each
 * pair contribute luma only.
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        /* even line: split out Y, U and V */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        ydst += lumStride;
        src += srcStride;
        /* odd line: extract luma only.
         * NOTE: relies on %%mm7 still holding the 0x00FF word mask set up
         * by the previous asm statement — fragile across asm boundaries. */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
/**
 * Upscale one plane by 2x in both directions using bilinear (1/4, 3/4)
 * weighting. First and last output rows, and the left/right edge columns,
 * replicate or use (3a+b)/4 blends of the nearest source samples.
 * The MMX2/3DNow path uses PAVGB twice to approximate the 3:1 blend.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;
    dst[0]= src[0];
    // first line: horizontal-only interpolation
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
    dst+= dstStride;
    for (y=1; y<srcHeight; y++) {
#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
        const x86_reg mmxSize= srcWidth&~15; /* widest multiple of 16 handled by MMX */
        __asm__ volatile(
            "mov %4, %%"REG_a" \n\t"         /* REG_a = -mmxSize, counts up to 0 */
            "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
            /* prime mm4/mm5 with the current row pair, left edge replicated */
            "movq (%0, %%"REG_a"), %%mm4 \n\t"
            "movq %%mm4, %%mm2 \n\t"
            "psllq $8, %%mm4 \n\t"
            "pand %%mm0, %%mm2 \n\t"
            "por %%mm2, %%mm4 \n\t"
            "movq (%1, %%"REG_a"), %%mm5 \n\t"
            "movq %%mm5, %%mm3 \n\t"
            "psllq $8, %%mm5 \n\t"
            "pand %%mm0, %%mm3 \n\t"
            "por %%mm3, %%mm5 \n\t"
            "1: \n\t"
            "movq (%0, %%"REG_a"), %%mm0 \n\t"
            "movq (%1, %%"REG_a"), %%mm1 \n\t"
            "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
            "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
            /* two PAVGBs approximate the (3a+b+2)>>2 blend */
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            /* interleave the two horizontal phases into 16 output pixels */
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpckhbw %%mm3, %%mm7 \n\t"
            "punpcklbw %%mm2, %%mm4 \n\t"
            "punpckhbw %%mm2, %%mm6 \n\t"
#if 1
            MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#else
            "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
            "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
            "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#endif
            "add $8, %%"REG_a" \n\t"
            /* reload mm4/mm5 one byte back so the next pass sees the seam pixels */
            "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
            "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
            " js 1b \n\t"
            :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%"REG_a
        );
#else
        const x86_reg mmxSize=1;
        /* plain C: handle the left edge column that the loop below skips */
        dst[0 ]= (3*src[0] + src[srcStride])>>2;
        dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
#endif
        /* scalar remainder (and left edge in the MMX case) */
        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
            dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
        }
        /* right edge column */
        dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
        dst+=dstStride*2;
        src+=srcStride;
    }
    // last line: horizontal-only interpolation
#if 1
    dst[0]= src[0];
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    for (x=0; x<srcWidth; x++) {
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
/**
 * Packed UYVY to planar YV12 (4:2:0 deinterleave).
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        /* even line: split out Y, U and V (Y sits in the high byte of each
         * UYVY word, so it is extracted with psrlw instead of pand) */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        ydst += lumStride;
        src += srcStride;
        /* odd line: extract luma only (high byte of each UYVY word) */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // UYVY UYVY(12)
            "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
  1640. /**
  1641. * Height should be a multiple of 2 and width should be a multiple of 2.
  1642. * (If this is a problem for anyone then tell me, and I will fix it.)
  1643. * Chrominance data is only taken from every second line,
  1644. * others are ignored in the C version.
  1645. * FIXME: Write HQ version.
  1646. */
  1647. static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1648. long width, long height,
  1649. long lumStride, long chromStride, long srcStride)
  1650. {
  1651. long y;
  1652. const x86_reg chromWidth= width>>1;
  1653. for (y=0; y<height-2; y+=2) {
  1654. long i;
  1655. for (i=0; i<2; i++) {
  1656. __asm__ volatile(
  1657. "mov %2, %%"REG_a" \n\t"
  1658. "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
  1659. "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
  1660. "pxor %%mm7, %%mm7 \n\t"
  1661. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
  1662. ".p2align 4 \n\t"
  1663. "1: \n\t"
  1664. PREFETCH" 64(%0, %%"REG_d") \n\t"
  1665. "movd (%0, %%"REG_d"), %%mm0 \n\t"
  1666. "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
  1667. "punpcklbw %%mm7, %%mm0 \n\t"
  1668. "punpcklbw %%mm7, %%mm1 \n\t"
  1669. "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
  1670. "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
  1671. "punpcklbw %%mm7, %%mm2 \n\t"
  1672. "punpcklbw %%mm7, %%mm3 \n\t"
  1673. "pmaddwd %%mm6, %%mm0 \n\t"
  1674. "pmaddwd %%mm6, %%mm1 \n\t"
  1675. "pmaddwd %%mm6, %%mm2 \n\t"
  1676. "pmaddwd %%mm6, %%mm3 \n\t"
  1677. #ifndef FAST_BGR2YV12
  1678. "psrad $8, %%mm0 \n\t"
  1679. "psrad $8, %%mm1 \n\t"
  1680. "psrad $8, %%mm2 \n\t"
  1681. "psrad $8, %%mm3 \n\t"
  1682. #endif
  1683. "packssdw %%mm1, %%mm0 \n\t"
  1684. "packssdw %%mm3, %%mm2 \n\t"
  1685. "pmaddwd %%mm5, %%mm0 \n\t"
  1686. "pmaddwd %%mm5, %%mm2 \n\t"
  1687. "packssdw %%mm2, %%mm0 \n\t"
  1688. "psraw $7, %%mm0 \n\t"
  1689. "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
  1690. "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
  1691. "punpcklbw %%mm7, %%mm4 \n\t"
  1692. "punpcklbw %%mm7, %%mm1 \n\t"
  1693. "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
  1694. "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
  1695. "punpcklbw %%mm7, %%mm2 \n\t"
  1696. "punpcklbw %%mm7, %%mm3 \n\t"
  1697. "pmaddwd %%mm6, %%mm4 \n\t"
  1698. "pmaddwd %%mm6, %%mm1 \n\t"
  1699. "pmaddwd %%mm6, %%mm2 \n\t"
  1700. "pmaddwd %%mm6, %%mm3 \n\t"
  1701. #ifndef FAST_BGR2YV12
  1702. "psrad $8, %%mm4 \n\t"
  1703. "psrad $8, %%mm1 \n\t"
  1704. "psrad $8, %%mm2 \n\t"
  1705. "psrad $8, %%mm3 \n\t"
  1706. #endif
  1707. "packssdw %%mm1, %%mm4 \n\t"
  1708. "packssdw %%mm3, %%mm2 \n\t"
  1709. "pmaddwd %%mm5, %%mm4 \n\t"
  1710. "pmaddwd %%mm5, %%mm2 \n\t"
  1711. "add $24, %%"REG_d" \n\t"
  1712. "packssdw %%mm2, %%mm4 \n\t"
  1713. "psraw $7, %%mm4 \n\t"
  1714. "packuswb %%mm4, %%mm0 \n\t"
  1715. "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
  1716. MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
  1717. "add $8, %%"REG_a" \n\t"
  1718. " js 1b \n\t"
  1719. : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
  1720. : "%"REG_a, "%"REG_d
  1721. );
  1722. ydst += lumStride;
  1723. src += srcStride;
  1724. }
  1725. src -= srcStride*2;
  1726. __asm__ volatile(
  1727. "mov %4, %%"REG_a" \n\t"
  1728. "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
  1729. "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
  1730. "pxor %%mm7, %%mm7 \n\t"
  1731. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
  1732. "add %%"REG_d", %%"REG_d" \n\t"
  1733. ".p2align 4 \n\t"
  1734. "1: \n\t"
  1735. PREFETCH" 64(%0, %%"REG_d") \n\t"
  1736. PREFETCH" 64(%1, %%"REG_d") \n\t"
  1737. #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
  1738. "movq (%0, %%"REG_d"), %%mm0 \n\t"
  1739. "movq (%1, %%"REG_d"), %%mm1 \n\t"
  1740. "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
  1741. "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
  1742. PAVGB" %%mm1, %%mm0 \n\t"
  1743. PAVGB" %%mm3, %%mm2 \n\t"
  1744. "movq %%mm0, %%mm1 \n\t"
  1745. "movq %%mm2, %%mm3 \n\t"
  1746. "psrlq $24, %%mm0 \n\t"
  1747. "psrlq $24, %%mm2 \n\t"
  1748. PAVGB" %%mm1, %%mm0 \n\t"
  1749. PAVGB" %%mm3, %%mm2 \n\t"
  1750. "punpcklbw %%mm7, %%mm0 \n\t"
  1751. "punpcklbw %%mm7, %%mm2 \n\t"
  1752. #else
  1753. "movd (%0, %%"REG_d"), %%mm0 \n\t"
  1754. "movd (%1, %%"REG_d"), %%mm1 \n\t"
  1755. "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
  1756. "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
  1757. "punpcklbw %%mm7, %%mm0 \n\t"
  1758. "punpcklbw %%mm7, %%mm1 \n\t"
  1759. "punpcklbw %%mm7, %%mm2 \n\t"
  1760. "punpcklbw %%mm7, %%mm3 \n\t"
  1761. "paddw %%mm1, %%mm0 \n\t"
  1762. "paddw %%mm3, %%mm2 \n\t"
  1763. "paddw %%mm2, %%mm0 \n\t"
  1764. "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
  1765. "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
  1766. "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
  1767. "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
  1768. "punpcklbw %%mm7, %%mm4 \n\t"
  1769. "punpcklbw %%mm7, %%mm1 \n\t"
  1770. "punpcklbw %%mm7, %%mm2 \n\t"
  1771. "punpcklbw %%mm7, %%mm3 \n\t"
  1772. "paddw %%mm1, %%mm4 \n\t"
  1773. "paddw %%mm3, %%mm2 \n\t"
  1774. "paddw %%mm4, %%mm2 \n\t"
  1775. "psrlw $2, %%mm0 \n\t"
  1776. "psrlw $2, %%mm2 \n\t"
  1777. #endif
  1778. "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
  1779. "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
  1780. "pmaddwd %%mm0, %%mm1 \n\t"
  1781. "pmaddwd %%mm2, %%mm3 \n\t"
  1782. "pmaddwd %%mm6, %%mm0 \n\t"
  1783. "pmaddwd %%mm6, %%mm2 \n\t"
  1784. #ifndef FAST_BGR2YV12
  1785. "psrad $8, %%mm0 \n\t"
  1786. "psrad $8, %%mm1 \n\t"
  1787. "psrad $8, %%mm2 \n\t"
  1788. "psrad $8, %%mm3 \n\t"
  1789. #endif
  1790. "packssdw %%mm2, %%mm0 \n\t"
  1791. "packssdw %%mm3, %%mm1 \n\t"
  1792. "pmaddwd %%mm5, %%mm0 \n\t"
  1793. "pmaddwd %%mm5, %%mm1 \n\t"
  1794. "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
  1795. "psraw $7, %%mm0 \n\t"
  1796. #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
  1797. "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
  1798. "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
  1799. "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
  1800. "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
  1801. PAVGB" %%mm1, %%mm4 \n\t"
  1802. PAVGB" %%mm3, %%mm2 \n\t"
  1803. "movq %%mm4, %%mm1 \n\t"
  1804. "movq %%mm2, %%mm3 \n\t"
  1805. "psrlq $24, %%mm4 \n\t"
  1806. "psrlq $24, %%mm2 \n\t"
  1807. PAVGB" %%mm1, %%mm4 \n\t"
  1808. PAVGB" %%mm3, %%mm2 \n\t"
  1809. "punpcklbw %%mm7, %%mm4 \n\t"
  1810. "punpcklbw %%mm7, %%mm2 \n\t"
  1811. #else
  1812. "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
  1813. "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
  1814. "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
  1815. "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
  1816. "punpcklbw %%mm7, %%mm4 \n\t"
  1817. "punpcklbw %%mm7, %%mm1 \n\t"
  1818. "punpcklbw %%mm7, %%mm2 \n\t"
  1819. "punpcklbw %%mm7, %%mm3 \n\t"
  1820. "paddw %%mm1, %%mm4 \n\t"
  1821. "paddw %%mm3, %%mm2 \n\t"
  1822. "paddw %%mm2, %%mm4 \n\t"
  1823. "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
  1824. "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
  1825. "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
  1826. "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
  1827. "punpcklbw %%mm7, %%mm5 \n\t"
  1828. "punpcklbw %%mm7, %%mm1 \n\t"
  1829. "punpcklbw %%mm7, %%mm2 \n\t"
  1830. "punpcklbw %%mm7, %%mm3 \n\t"
  1831. "paddw %%mm1, %%mm5 \n\t"
  1832. "paddw %%mm3, %%mm2 \n\t"
  1833. "paddw %%mm5, %%mm2 \n\t"
  1834. "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
  1835. "psrlw $2, %%mm4 \n\t"
  1836. "psrlw $2, %%mm2 \n\t"
  1837. #endif
  1838. "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
  1839. "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
  1840. "pmaddwd %%mm4, %%mm1 \n\t"
  1841. "pmaddwd %%mm2, %%mm3 \n\t"
  1842. "pmaddwd %%mm6, %%mm4 \n\t"
  1843. "pmaddwd %%mm6, %%mm2 \n\t"
  1844. #ifndef FAST_BGR2YV12
  1845. "psrad $8, %%mm4 \n\t"
  1846. "psrad $8, %%mm1 \n\t"
  1847. "psrad $8, %%mm2 \n\t"
  1848. "psrad $8, %%mm3 \n\t"
  1849. #endif
  1850. "packssdw %%mm2, %%mm4 \n\t"
  1851. "packssdw %%mm3, %%mm1 \n\t"
  1852. "pmaddwd %%mm5, %%mm4 \n\t"
  1853. "pmaddwd %%mm5, %%mm1 \n\t"
  1854. "add $24, %%"REG_d" \n\t"
  1855. "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
  1856. "psraw $7, %%mm4 \n\t"
  1857. "movq %%mm0, %%mm1 \n\t"
  1858. "punpckldq %%mm4, %%mm0 \n\t"
  1859. "punpckhdq %%mm4, %%mm1 \n\t"
  1860. "packsswb %%mm1, %%mm0 \n\t"
  1861. "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
  1862. "movd %%mm0, (%2, %%"REG_a") \n\t"
  1863. "punpckhdq %%mm0, %%mm0 \n\t"
  1864. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1865. "add $4, %%"REG_a" \n\t"
  1866. " js 1b \n\t"
  1867. : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
  1868. : "%"REG_a, "%"REG_d
  1869. );
  1870. udst += chromStride;
  1871. vdst += chromStride;
  1872. src += srcStride*2;
  1873. }
  1874. __asm__ volatile(EMMS" \n\t"
  1875. SFENCE" \n\t"
  1876. :::"memory");
  1877. for (; y<height; y+=2) {
  1878. long i;
  1879. for (i=0; i<chromWidth; i++) {
  1880. unsigned int b = src[6*i+0];
  1881. unsigned int g = src[6*i+1];
  1882. unsigned int r = src[6*i+2];
  1883. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  1884. unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
  1885. unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
  1886. udst[i] = U;
  1887. vdst[i] = V;
  1888. ydst[2*i] = Y;
  1889. b = src[6*i+3];
  1890. g = src[6*i+4];
  1891. r = src[6*i+5];
  1892. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  1893. ydst[2*i+1] = Y;
  1894. }
  1895. ydst += lumStride;
  1896. src += srcStride;
  1897. for (i=0; i<chromWidth; i++) {
  1898. unsigned int b = src[6*i+0];
  1899. unsigned int g = src[6*i+1];
  1900. unsigned int r = src[6*i+2];
  1901. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  1902. ydst[2*i] = Y;
  1903. b = src[6*i+3];
  1904. g = src[6*i+4];
  1905. r = src[6*i+5];
  1906. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  1907. ydst[2*i+1] = Y;
  1908. }
  1909. udst += chromStride;
  1910. vdst += chromStride;
  1911. ydst += lumStride;
  1912. src += srcStride;
  1913. }
  1914. }
  1915. static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
  1916. long width, long height, long src1Stride,
  1917. long src2Stride, long dstStride)
  1918. {
  1919. long h;
  1920. for (h=0; h < height; h++) {
  1921. long w;
  1922. #if COMPILE_TEMPLATE_SSE2
  1923. __asm__(
  1924. "xor %%"REG_a", %%"REG_a" \n\t"
  1925. "1: \n\t"
  1926. PREFETCH" 64(%1, %%"REG_a") \n\t"
  1927. PREFETCH" 64(%2, %%"REG_a") \n\t"
  1928. "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
  1929. "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
  1930. "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
  1931. "punpcklbw %%xmm2, %%xmm0 \n\t"
  1932. "punpckhbw %%xmm2, %%xmm1 \n\t"
  1933. "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
  1934. "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
  1935. "add $16, %%"REG_a" \n\t"
  1936. "cmp %3, %%"REG_a" \n\t"
  1937. " jb 1b \n\t"
  1938. ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
  1939. : "memory", "%"REG_a""
  1940. );
  1941. #else
  1942. __asm__(
  1943. "xor %%"REG_a", %%"REG_a" \n\t"
  1944. "1: \n\t"
  1945. PREFETCH" 64(%1, %%"REG_a") \n\t"
  1946. PREFETCH" 64(%2, %%"REG_a") \n\t"
  1947. "movq (%1, %%"REG_a"), %%mm0 \n\t"
  1948. "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
  1949. "movq %%mm0, %%mm1 \n\t"
  1950. "movq %%mm2, %%mm3 \n\t"
  1951. "movq (%2, %%"REG_a"), %%mm4 \n\t"
  1952. "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
  1953. "punpcklbw %%mm4, %%mm0 \n\t"
  1954. "punpckhbw %%mm4, %%mm1 \n\t"
  1955. "punpcklbw %%mm5, %%mm2 \n\t"
  1956. "punpckhbw %%mm5, %%mm3 \n\t"
  1957. MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
  1958. MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
  1959. MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
  1960. MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
  1961. "add $16, %%"REG_a" \n\t"
  1962. "cmp %3, %%"REG_a" \n\t"
  1963. " jb 1b \n\t"
  1964. ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
  1965. : "memory", "%"REG_a
  1966. );
  1967. #endif
  1968. for (w= (width&(~15)); w < width; w++) {
  1969. dest[2*w+0] = src1[w];
  1970. dest[2*w+1] = src2[w];
  1971. }
  1972. dest += dstStride;
  1973. src1 += src1Stride;
  1974. src2 += src2Stride;
  1975. }
  1976. __asm__(
  1977. EMMS" \n\t"
  1978. SFENCE" \n\t"
  1979. ::: "memory"
  1980. );
  1981. }
/*
 * Upsample two quarter-resolution chroma planes by 2x in both directions:
 *   dst[y][2*x] = dst[y][2*x+1] = src[y>>1][x]
 * for y in [0, height/2) and x in [0, width/2) — i.e. every source byte is
 * duplicated horizontally and every source row is used for two output rows.
 * (Presumably this is the YVU9 -> "vu12" chroma expansion the name implies;
 * confirm against the callers.)
 * The MMX loop expands 32 source bytes to 64 output bytes per iteration via
 * punpcklbw/punpckhbw self-unpacks; a scalar loop finishes each row.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    x86_reg y;
    long x,w,h;
    w=width/2; h=height/2;
    /* warm the cache with the second row of each source plane */
    __asm__ volatile(
        PREFETCH" %0 \n\t"
        PREFETCH" %1 \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
    /* first plane: src1 -> dst1 */
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);   /* each src row feeds 2 dst rows */
        uint8_t* d=dst1+dstStride1*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32%1 \n\t"
                "movq %1, %%mm0 \n\t"
                "movq 8%1, %%mm2 \n\t"
                "movq 16%1, %%mm4 \n\t"
                "movq 24%1, %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                /* unpack each register with itself: AABBCCDD... byte doubling */
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, %0 \n\t"
                MOVNTQ" %%mm1, 8%0 \n\t"
                MOVNTQ" %%mm2, 16%0 \n\t"
                MOVNTQ" %%mm3, 24%0 \n\t"
                MOVNTQ" %%mm4, 32%0 \n\t"
                MOVNTQ" %%mm5, 40%0 \n\t"
                MOVNTQ" %%mm6, 48%0 \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
        /* scalar tail: duplicate remaining bytes */
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    /* second plane: src2 -> dst2 (same expansion) */
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32%1 \n\t"
                "movq %1, %%mm0 \n\t"
                "movq 8%1, %%mm2 \n\t"
                "movq 16%1, %%mm4 \n\t"
                "movq 24%1, %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, %0 \n\t"
                MOVNTQ" %%mm1, 8%0 \n\t"
                MOVNTQ" %%mm2, 16%0 \n\t"
                MOVNTQ" %%mm3, 24%0 \n\t"
                MOVNTQ" %%mm4, 32%0 \n\t"
                MOVNTQ" %%mm5, 40%0 \n\t"
                MOVNTQ" %%mm6, 48%0 \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
    /* clear MMX state and drain the non-temporal store buffers */
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
/*
 * Pack planar YUV with 4x4-subsampled chroma (YVU9-style: the chroma rows
 * advance only every 4th luma row, y>>2, and each chroma sample is shared by
 * 4 horizontally adjacent luma samples) into packed YUYV, per the scalar
 * fallback: d[8x..8x+7] = Y,U,Y,V,Y,U,Y,V.
 * NOTE(review): w = width/2 here, so 'width' is apparently not the luma
 * width in pixels — confirm the units against the callers.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    x86_reg x;
    long y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;        /* luma row */
        const uint8_t* up=src2+srcStride2*(y>>2);   /* chroma rows advance every 4th line */
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
        /* MMX loop: 32 luma + 8 U + 8 V bytes -> 64 packed output bytes */
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH" 32(%1, %0) \n\t"
                PREFETCH" 32(%2, %0) \n\t"
                PREFETCH" 32(%3, %0) \n\t"
                "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
                "movq %%mm1, %%mm6 \n\t"
                "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
                "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq 8(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
                "movq %%mm4, %%mm6 \n\t"
                "movq 16(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm5, %%mm4 \n\t"
                "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
                "punpckhbw %%mm5, %%mm6 \n\t"
                "movq 24(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
        /* scalar tail: each chroma sample covers 4 luma samples */
        for (; x<w; x++) {
            const long x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
    /* clear MMX state and drain the non-temporal store buffers */
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
  2155. static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
  2156. {
  2157. dst += count;
  2158. src += 2*count;
  2159. count= - count;
  2160. if(count <= -16) {
  2161. count += 15;
  2162. __asm__ volatile(
  2163. "pcmpeqw %%mm7, %%mm7 \n\t"
  2164. "psrlw $8, %%mm7 \n\t"
  2165. "1: \n\t"
  2166. "movq -30(%1, %0, 2), %%mm0 \n\t"
  2167. "movq -22(%1, %0, 2), %%mm1 \n\t"
  2168. "movq -14(%1, %0, 2), %%mm2 \n\t"
  2169. "movq -6(%1, %0, 2), %%mm3 \n\t"
  2170. "pand %%mm7, %%mm0 \n\t"
  2171. "pand %%mm7, %%mm1 \n\t"
  2172. "pand %%mm7, %%mm2 \n\t"
  2173. "pand %%mm7, %%mm3 \n\t"
  2174. "packuswb %%mm1, %%mm0 \n\t"
  2175. "packuswb %%mm3, %%mm2 \n\t"
  2176. MOVNTQ" %%mm0,-15(%2, %0) \n\t"
  2177. MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
  2178. "add $16, %0 \n\t"
  2179. " js 1b \n\t"
  2180. : "+r"(count)
  2181. : "r"(src), "r"(dst)
  2182. );
  2183. count -= 15;
  2184. }
  2185. while(count<0) {
  2186. dst[count]= src[2*count];
  2187. count++;
  2188. }
  2189. }
  2190. static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
  2191. {
  2192. dst0+= count;
  2193. dst1+= count;
  2194. src += 4*count;
  2195. count= - count;
  2196. if(count <= -8) {
  2197. count += 7;
  2198. __asm__ volatile(
  2199. "pcmpeqw %%mm7, %%mm7 \n\t"
  2200. "psrlw $8, %%mm7 \n\t"
  2201. "1: \n\t"
  2202. "movq -28(%1, %0, 4), %%mm0 \n\t"
  2203. "movq -20(%1, %0, 4), %%mm1 \n\t"
  2204. "movq -12(%1, %0, 4), %%mm2 \n\t"
  2205. "movq -4(%1, %0, 4), %%mm3 \n\t"
  2206. "pand %%mm7, %%mm0 \n\t"
  2207. "pand %%mm7, %%mm1 \n\t"
  2208. "pand %%mm7, %%mm2 \n\t"
  2209. "pand %%mm7, %%mm3 \n\t"
  2210. "packuswb %%mm1, %%mm0 \n\t"
  2211. "packuswb %%mm3, %%mm2 \n\t"
  2212. "movq %%mm0, %%mm1 \n\t"
  2213. "movq %%mm2, %%mm3 \n\t"
  2214. "psrlw $8, %%mm0 \n\t"
  2215. "psrlw $8, %%mm2 \n\t"
  2216. "pand %%mm7, %%mm1 \n\t"
  2217. "pand %%mm7, %%mm3 \n\t"
  2218. "packuswb %%mm2, %%mm0 \n\t"
  2219. "packuswb %%mm3, %%mm1 \n\t"
  2220. MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
  2221. MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
  2222. "add $8, %0 \n\t"
  2223. " js 1b \n\t"
  2224. : "+r"(count)
  2225. : "r"(src), "r"(dst0), "r"(dst1)
  2226. );
  2227. count -= 7;
  2228. }
  2229. while(count<0) {
  2230. dst0[count]= src[4*count+0];
  2231. dst1[count]= src[4*count+2];
  2232. count++;
  2233. }
  2234. }
  2235. static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
  2236. {
  2237. dst0 += count;
  2238. dst1 += count;
  2239. src0 += 4*count;
  2240. src1 += 4*count;
  2241. count= - count;
  2242. #ifdef PAVGB
  2243. if(count <= -8) {
  2244. count += 7;
  2245. __asm__ volatile(
  2246. "pcmpeqw %%mm7, %%mm7 \n\t"
  2247. "psrlw $8, %%mm7 \n\t"
  2248. "1: \n\t"
  2249. "movq -28(%1, %0, 4), %%mm0 \n\t"
  2250. "movq -20(%1, %0, 4), %%mm1 \n\t"
  2251. "movq -12(%1, %0, 4), %%mm2 \n\t"
  2252. "movq -4(%1, %0, 4), %%mm3 \n\t"
  2253. PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
  2254. PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
  2255. PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
  2256. PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
  2257. "pand %%mm7, %%mm0 \n\t"
  2258. "pand %%mm7, %%mm1 \n\t"
  2259. "pand %%mm7, %%mm2 \n\t"
  2260. "pand %%mm7, %%mm3 \n\t"
  2261. "packuswb %%mm1, %%mm0 \n\t"
  2262. "packuswb %%mm3, %%mm2 \n\t"
  2263. "movq %%mm0, %%mm1 \n\t"
  2264. "movq %%mm2, %%mm3 \n\t"
  2265. "psrlw $8, %%mm0 \n\t"
  2266. "psrlw $8, %%mm2 \n\t"
  2267. "pand %%mm7, %%mm1 \n\t"
  2268. "pand %%mm7, %%mm3 \n\t"
  2269. "packuswb %%mm2, %%mm0 \n\t"
  2270. "packuswb %%mm3, %%mm1 \n\t"
  2271. MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
  2272. MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
  2273. "add $8, %0 \n\t"
  2274. " js 1b \n\t"
  2275. : "+r"(count)
  2276. : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
  2277. );
  2278. count -= 7;
  2279. }
  2280. #endif
  2281. while(count<0) {
  2282. dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
  2283. dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
  2284. count++;
  2285. }
  2286. }
  2287. static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
  2288. {
  2289. dst0+= count;
  2290. dst1+= count;
  2291. src += 4*count;
  2292. count= - count;
  2293. if(count <= -8) {
  2294. count += 7;
  2295. __asm__ volatile(
  2296. "pcmpeqw %%mm7, %%mm7 \n\t"
  2297. "psrlw $8, %%mm7 \n\t"
  2298. "1: \n\t"
  2299. "movq -28(%1, %0, 4), %%mm0 \n\t"
  2300. "movq -20(%1, %0, 4), %%mm1 \n\t"
  2301. "movq -12(%1, %0, 4), %%mm2 \n\t"
  2302. "movq -4(%1, %0, 4), %%mm3 \n\t"
  2303. "psrlw $8, %%mm0 \n\t"
  2304. "psrlw $8, %%mm1 \n\t"
  2305. "psrlw $8, %%mm2 \n\t"
  2306. "psrlw $8, %%mm3 \n\t"
  2307. "packuswb %%mm1, %%mm0 \n\t"
  2308. "packuswb %%mm3, %%mm2 \n\t"
  2309. "movq %%mm0, %%mm1 \n\t"
  2310. "movq %%mm2, %%mm3 \n\t"
  2311. "psrlw $8, %%mm0 \n\t"
  2312. "psrlw $8, %%mm2 \n\t"
  2313. "pand %%mm7, %%mm1 \n\t"
  2314. "pand %%mm7, %%mm3 \n\t"
  2315. "packuswb %%mm2, %%mm0 \n\t"
  2316. "packuswb %%mm3, %%mm1 \n\t"
  2317. MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
  2318. MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
  2319. "add $8, %0 \n\t"
  2320. " js 1b \n\t"
  2321. : "+r"(count)
  2322. : "r"(src), "r"(dst0), "r"(dst1)
  2323. );
  2324. count -= 7;
  2325. }
  2326. src++;
  2327. while(count<0) {
  2328. dst0[count]= src[4*count+0];
  2329. dst1[count]= src[4*count+2];
  2330. count++;
  2331. }
  2332. }
  2333. static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
  2334. {
  2335. dst0 += count;
  2336. dst1 += count;
  2337. src0 += 4*count;
  2338. src1 += 4*count;
  2339. count= - count;
  2340. #ifdef PAVGB
  2341. if(count <= -8) {
  2342. count += 7;
  2343. __asm__ volatile(
  2344. "pcmpeqw %%mm7, %%mm7 \n\t"
  2345. "psrlw $8, %%mm7 \n\t"
  2346. "1: \n\t"
  2347. "movq -28(%1, %0, 4), %%mm0 \n\t"
  2348. "movq -20(%1, %0, 4), %%mm1 \n\t"
  2349. "movq -12(%1, %0, 4), %%mm2 \n\t"
  2350. "movq -4(%1, %0, 4), %%mm3 \n\t"
  2351. PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
  2352. PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
  2353. PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
  2354. PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
  2355. "psrlw $8, %%mm0 \n\t"
  2356. "psrlw $8, %%mm1 \n\t"
  2357. "psrlw $8, %%mm2 \n\t"
  2358. "psrlw $8, %%mm3 \n\t"
  2359. "packuswb %%mm1, %%mm0 \n\t"
  2360. "packuswb %%mm3, %%mm2 \n\t"
  2361. "movq %%mm0, %%mm1 \n\t"
  2362. "movq %%mm2, %%mm3 \n\t"
  2363. "psrlw $8, %%mm0 \n\t"
  2364. "psrlw $8, %%mm2 \n\t"
  2365. "pand %%mm7, %%mm1 \n\t"
  2366. "pand %%mm7, %%mm3 \n\t"
  2367. "packuswb %%mm2, %%mm0 \n\t"
  2368. "packuswb %%mm3, %%mm1 \n\t"
  2369. MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
  2370. MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
  2371. "add $8, %0 \n\t"
  2372. " js 1b \n\t"
  2373. : "+r"(count)
  2374. : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
  2375. );
  2376. count -= 7;
  2377. }
  2378. #endif
  2379. src0++;
  2380. src1++;
  2381. while(count<0) {
  2382. dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
  2383. dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
  2384. count++;
  2385. }
  2386. }
  2387. static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2388. long width, long height,
  2389. long lumStride, long chromStride, long srcStride)
  2390. {
  2391. long y;
  2392. const long chromWidth= -((-width)>>1);
  2393. for (y=0; y<height; y++) {
  2394. RENAME(extract_even)(src, ydst, width);
  2395. if(y&1) {
  2396. RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
  2397. udst+= chromStride;
  2398. vdst+= chromStride;
  2399. }
  2400. src += srcStride;
  2401. ydst+= lumStride;
  2402. }
  2403. __asm__(
  2404. EMMS" \n\t"
  2405. SFENCE" \n\t"
  2406. ::: "memory"
  2407. );
  2408. }
  2409. static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2410. long width, long height,
  2411. long lumStride, long chromStride, long srcStride)
  2412. {
  2413. long y;
  2414. const long chromWidth= -((-width)>>1);
  2415. for (y=0; y<height; y++) {
  2416. RENAME(extract_even)(src, ydst, width);
  2417. RENAME(extract_odd2)(src, udst, vdst, chromWidth);
  2418. src += srcStride;
  2419. ydst+= lumStride;
  2420. udst+= chromStride;
  2421. vdst+= chromStride;
  2422. }
  2423. __asm__(
  2424. EMMS" \n\t"
  2425. SFENCE" \n\t"
  2426. ::: "memory"
  2427. );
  2428. }
  2429. static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2430. long width, long height,
  2431. long lumStride, long chromStride, long srcStride)
  2432. {
  2433. long y;
  2434. const long chromWidth= -((-width)>>1);
  2435. for (y=0; y<height; y++) {
  2436. RENAME(extract_even)(src+1, ydst, width);
  2437. if(y&1) {
  2438. RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
  2439. udst+= chromStride;
  2440. vdst+= chromStride;
  2441. }
  2442. src += srcStride;
  2443. ydst+= lumStride;
  2444. }
  2445. __asm__(
  2446. EMMS" \n\t"
  2447. SFENCE" \n\t"
  2448. ::: "memory"
  2449. );
  2450. }
  2451. static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
  2452. long width, long height,
  2453. long lumStride, long chromStride, long srcStride)
  2454. {
  2455. long y;
  2456. const long chromWidth= -((-width)>>1);
  2457. for (y=0; y<height; y++) {
  2458. RENAME(extract_even)(src+1, ydst, width);
  2459. RENAME(extract_even2)(src, udst, vdst, chromWidth);
  2460. src += srcStride;
  2461. ydst+= lumStride;
  2462. udst+= chromStride;
  2463. vdst+= chromStride;
  2464. }
  2465. __asm__(
  2466. EMMS" \n\t"
  2467. SFENCE" \n\t"
  2468. ::: "memory"
  2469. );
  2470. }
  2471. static inline void RENAME(rgb2rgb_init)(void)
  2472. {
  2473. rgb15to16 = RENAME(rgb15to16);
  2474. rgb15tobgr24 = RENAME(rgb15tobgr24);
  2475. rgb15to32 = RENAME(rgb15to32);
  2476. rgb16tobgr24 = RENAME(rgb16tobgr24);
  2477. rgb16to32 = RENAME(rgb16to32);
  2478. rgb16to15 = RENAME(rgb16to15);
  2479. rgb24tobgr16 = RENAME(rgb24tobgr16);
  2480. rgb24tobgr15 = RENAME(rgb24tobgr15);
  2481. rgb24tobgr32 = RENAME(rgb24tobgr32);
  2482. rgb32to16 = RENAME(rgb32to16);
  2483. rgb32to15 = RENAME(rgb32to15);
  2484. rgb32tobgr24 = RENAME(rgb32tobgr24);
  2485. rgb24to15 = RENAME(rgb24to15);
  2486. rgb24to16 = RENAME(rgb24to16);
  2487. rgb24tobgr24 = RENAME(rgb24tobgr24);
  2488. shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
  2489. rgb32tobgr16 = RENAME(rgb32tobgr16);
  2490. rgb32tobgr15 = RENAME(rgb32tobgr15);
  2491. yv12toyuy2 = RENAME(yv12toyuy2);
  2492. yv12touyvy = RENAME(yv12touyvy);
  2493. yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
  2494. yuv422ptouyvy = RENAME(yuv422ptouyvy);
  2495. yuy2toyv12 = RENAME(yuy2toyv12);
  2496. planar2x = RENAME(planar2x);
  2497. rgb24toyv12 = RENAME(rgb24toyv12);
  2498. interleaveBytes = RENAME(interleaveBytes);
  2499. vu9_to_vu12 = RENAME(vu9_to_vu12);
  2500. yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
  2501. uyvytoyuv420 = RENAME(uyvytoyuv420);
  2502. uyvytoyuv422 = RENAME(uyvytoyuv422);
  2503. yuyvtoyuv420 = RENAME(yuyvtoyuv420);
  2504. yuyvtoyuv422 = RENAME(yuyvtoyuv422);
  2505. }