You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1896 lines
48KB

  1. /*
  2. *
  3. * rgb2rgb.c, Software RGB to RGB convertor
  4. * pluralize by Software PAL8 to RGB convertor
  5. * Software YUV to YUV convertor
  6. * Software YUV to RGB convertor
  7. * Written by Nick Kurshev.
  8. * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
  9. */
  10. #include <stddef.h>
  11. #include <inttypes.h> /* for __WORDSIZE */
  12. #ifndef __WORDSIZE
  13. #warning You have misconfigured system and probably will lose performance!
  14. #endif
  15. #undef PREFETCH
  16. #undef MOVNTQ
  17. #undef EMMS
  18. #undef SFENCE
  19. #undef MMREG_SIZE
  20. #undef PREFETCHW
  21. #undef PAVGB
  22. #ifdef HAVE_SSE2
  23. #define MMREG_SIZE 16
  24. #else
  25. #define MMREG_SIZE 8
  26. #endif
  27. #ifdef HAVE_3DNOW
  28. #define PREFETCH "prefetch"
  29. #define PREFETCHW "prefetchw"
  30. #define PAVGB "pavgusb"
  31. #elif defined ( HAVE_MMX2 )
  32. #define PREFETCH "prefetchnta"
  33. #define PREFETCHW "prefetcht0"
  34. #define PAVGB "pavgb"
  35. #else
  36. #define PREFETCH "/nop"
  37. #define PREFETCHW "/nop"
  38. #endif
  39. #ifdef HAVE_3DNOW
  40. /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
  41. #define EMMS "femms"
  42. #else
  43. #define EMMS "emms"
  44. #endif
  45. #ifdef HAVE_MMX2
  46. #define MOVNTQ "movntq"
  47. #define SFENCE "sfence"
  48. #else
  49. #define MOVNTQ "movq"
  50. #define SFENCE "/nop"
  51. #endif
/*
 * Convert packed 24bpp RGB (3 bytes/pixel) to 32bpp by copying the three
 * colour bytes unchanged and appending a zero as the fourth byte.
 * src_size is the source size in bytes; dst must hold src_size/3*4 bytes.
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
    /* Round the end *address* down to a multiple of 24 so the MMX loop only
       handles whole 8-pixel (24-byte) input groups; up to 23 bytes remain
       for the scalar tail loop below. */
    mm_end = (uint8_t*)((((unsigned long)end)/24)*24);
    while(s < mm_end)
    {
        /* Expand 8 packed 24-bit pixels into 8 dwords; mm7 (mask32) clears
           the byte that becomes the unused fourth component. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "punpckldq 3%1, %%mm0\n\t"
            "movd 6%1, %%mm1\n\t"
            "punpckldq 9%1, %%mm1\n\t"
            "movd 12%1, %%mm2\n\t"
            "punpckldq 15%1, %%mm2\n\t"
            "movd 18%1, %%mm3\n\t"
            "punpckldq 21%1, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm1\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 0; /* fourth byte is always written as zero */
    }
}
/*
 * Convert 32bpp RGB to packed 24bpp by dropping the fourth byte of every
 * pixel. src_size is the source size in bytes; dst must hold
 * src_size/4*3 bytes.
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* Round the end *address* down to a multiple of 32 so the MMX loop only
       processes whole 8-pixel (32-byte) groups; the remainder is handled by
       the scalar loop below. */
    mm_end = (uint8_t*)((((unsigned long)end)/32)*32);
    while(s < mm_end)
    {
        /* Compact 8 dword pixels into 24 packed bytes, shifting and merging
           the colour bytes with the mask24* constants. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"
            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: copy three colour bytes, skip the fourth. */
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}
  176. /*
  177. Original by Strepto/Astral
  178. ported to gcc & bugfixed : A'rpi
  179. MMX2, 3DNOW optimization by Nick Kurshev
  180. 32bit c version, and and&add trick by Michael Niedermayer
  181. */
/*
 * Convert 15-bit (x555) pixels to 16-bit (565) in place-compatible buffers.
 * The and&add trick ((x&0x7FFF) + (x&0x7FE0)) adds the green+red fields to
 * themselves, shifting them up one bit while blue stays put; the freed
 * green LSB is zero. src_size is in bytes.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    uint8_t *mm_end; /* declared unconditionally: reused by the dword loop below */
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
    /* Round end address down to a multiple of 16: MMX loop does 8 pixels
       (16 bytes) per iteration. */
    mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
    while(s<mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "pand %%mm4, %%mm0\n\t"
            "pand %%mm4, %%mm2\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* 32-bit C path: two pixels per iteration using the same and&add trick. */
    mm_end = (uint8_t*)((((unsigned long)end)/4)*4);
    while(s < mm_end)
    {
        register unsigned x= *((uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* At most one 16-bit pixel can remain. */
    if(s < end)
    {
        register unsigned short x= *((uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
  230. static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
  231. {
  232. unsigned j,i,num_pixels=src_size/3;
  233. for(i=0,j=0; j<num_pixels; i+=3,j+=3)
  234. {
  235. dst[j+0] = src[i+2];
  236. dst[j+1] = src[i+1];
  237. dst[j+2] = src[i+0];
  238. }
  239. }
/*
 * Convert 32bpp RGB to 16bpp (565). src_size is in bytes; dst receives one
 * uint16_t per source dword. mm7/mm6 hold the red/green 565 masks for the
 * whole loop; the blue mask is supplied per iteration as operand %2.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /* MMX loop converts 4 dword pixels (16 bytes) per iteration. */
    mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: pack b/g/r into 5-6-5 and skip the fourth byte. */
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        s++; /* skip unused fourth byte */
    }
}
/*
 * Convert 32bpp RGB to 15bpp (x555). Same structure as rgb32to16 but with
 * 5-bit green (shifts 6/9 instead of 5/8 and the *_15mask constants).
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* MMX loop converts 4 dword pixels (16 bytes) per iteration. */
    mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: pack b/g/r into 5-5-5 and skip the fourth byte. */
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        s++; /* skip unused fourth byte */
    }
}
/*
 * Convert packed 24bpp RGB to 16bpp (565). Like rgb32to16 but the input
 * pixels are 3 bytes apart (movd offsets 0/3/6/9, advance by 12 bytes).
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
    while(s < mm_end)
    {
        /* 4 pixels per iteration; each movd grabs 4 bytes so one stray byte
           of the next pixel is loaded too, but masked away below. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: no fourth byte to skip for 24bpp input. */
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
/*
 * Convert packed 24bpp RGB to 15bpp (x555). Like rgb24to16 but with 5-bit
 * green (shifts 6/9 and the *_15mask constants).
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
    while(s < mm_end)
    {
        /* 4 pixels per iteration; movd over-reads one byte which the masks
           discard. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail. */
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
/*
  I use here a less accurate approximation by simply left-shifting the input
  value and filling the low-order bits with zeroes. This method improves PNG
  compression but cannot reproduce white exactly, since it does not generate
  an all-ones maximum value; the net effect is to darken the image slightly.

  The better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      Leftmost Bits Repeated to Fill Open Bits
       |
   Original Bits
*/
/*
 * Expand 15bpp (x555) pixels to packed 24bpp. Each 5-bit field is shifted
 * into the high bits of a byte; low bits are zero-filled (see the accuracy
 * note above this function). src_size is in bytes; dst must hold
 * src_size/2*3 bytes.
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* Round end address down to a multiple of 8: 8 pixels per iteration. */
    mm_end = (uint16_t*)((((unsigned long)end)/8)*8);
    while(s < mm_end)
    {
        /* First asm: unpack 8 pixels into 32bpp form held in mm0/mm3/mm6/mm7.
           Register state is carried into the second asm statement below, so
           the two must stay back to back with no MMX use in between. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"
            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* Borrowed 32 to 24: repack the four 32bpp quadwords into 24 bytes
           (same shuffle as rgb32to24). */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"
            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: extract the 5-bit fields and left-shift into bytes. */
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
/*
 * Expand 16bpp (565) pixels to packed 24bpp. Same structure as rgb15to24
 * but with a 6-bit green field (shifts 3/8 and the mask16* constants).
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = (uint16_t*)((((unsigned long)end)/8)*8);
    while(s < mm_end)
    {
        /* First asm: unpack 8 pixels to 32bpp in mm0/mm3/mm6/mm7; the second
           asm below consumes that register state, so keep them adjacent. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"
            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* Borrowed 32 to 24: repack the 32bpp quadwords into 24 bytes. */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"
            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: 5-6-5 fields left-shifted into full bytes. */
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
/*
 * Expand 15bpp (x555) pixels to 32bpp; the fourth byte of each output pixel
 * is zero. src_size is in bytes; dst must hold src_size*2 bytes.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); /* mm7 = zero filler */
    /* Round end address down to a multiple of 4: 4 pixels per iteration. */
    mm_end = (uint16_t*)((((unsigned long)end)/4)*4);
    while(s < mm_end)
    {
        /* Split b/g/r fields, widen each word to a dword against mm7 (zero),
           then merge the byte lanes. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail. */
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 0; /* fourth byte always zero */
    }
}
/*
 * Expand 16bpp (565) pixels to 32bpp; the fourth byte of each output pixel
 * is zero. Same structure as rgb15to32 with 6-bit green (shifts 3/8 and the
 * mask16* constants).
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); /* mm7 = zero filler */
    mm_end = (uint16_t*)((((unsigned long)end)/4)*4);
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail. */
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 0; /* fourth byte always zero */
    }
}
/*
 * Swap the R and B channels of 32bpp pixels (RGB32 <-> BGR32).
 * src_size is in bytes and is assumed to be a multiple of 8 by the MMX path.
 * NOTE(review): the MMX path masks the fourth byte to zero while the C
 * fallback leaves dst[4*i+3] unwritten — the two paths disagree on that
 * byte; confirm callers do not rely on it.
 * NOTE(review): the asm uses 32-bit %eax indexing (addl/cmpl) — IA-32 only.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
#ifdef HAVE_MMX
/* TODO: unroll this loop */
    asm volatile (
        "xorl %%eax, %%eax \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 32(%0, %%eax) \n\t"
        "movq (%0, %%eax), %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "pslld $16, %%mm0 \n\t"
        "psrld $16, %%mm1 \n\t"
        "pand "MANGLE(mask32r)", %%mm0 \n\t"
        "pand "MANGLE(mask32g)", %%mm2 \n\t"
        "pand "MANGLE(mask32b)", %%mm1 \n\t"
        "por %%mm0, %%mm2 \n\t"
        "por %%mm1, %%mm2 \n\t"
        MOVNTQ" %%mm2, (%1, %%eax) \n\t"
        "addl $8, %%eax \n\t"
        "cmpl %2, %%eax \n\t"
        " jb 1b \n\t"
        :: "r" (src), "r"(dst), "r" (src_size)
        : "%eax"
    );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned i;
    unsigned num_pixels = src_size >> 2;
    for(i=0; i<num_pixels; i++)
    {
        dst[4*i + 0] = src[4*i + 2]; /* B <- R */
        dst[4*i + 1] = src[4*i + 1]; /* G      */
        dst[4*i + 2] = src[4*i + 0]; /* R <- B */
    }
#endif
}
/*
 * Swap the R and B channels of packed 24bpp data (RGB24 <-> BGR24).
 * src_size is in bytes.
 *
 * The MMX loop uses a negative-index trick: %%eax starts at 23 - src_size
 * (negative for buffers larger than 23 bytes) and the src/dst bases are
 * offset by -mmx_size, so (base, %%eax) starts at the buffer start and the
 * loop runs while %%eax stays negative ("js 1b"). The last up-to-23 bytes
 * are then redone by the scalar loop after the pointer fixup below.
 * NOTE(review): relies on 32-bit %eax addressing — IA-32 only.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i;
#ifdef HAVE_MMX
    int mmx_size= 23 - src_size;
    asm volatile (
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%eax) \n\t"
        "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
        "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
        "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
        "addl $24, %%eax \n\t"
        " js 1b \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
    if(mmx_size==23) return; // finished, src_size was a multiple of the block size
    /* Rewind the pointers so the scalar loop covers the remaining tail. */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    /* Scalar path: swap bytes 0 and 2 of each 3-byte pixel. Reading src[i+2]
       into a temporary first keeps this correct when src == dst. */
    for(i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
/*
 * Interleave planar YUV into packed YUY2 (Y0 U Y1 V ...).
 * width and height are in luma pixels; lumStride, chromStride and dstStride
 * are in bytes. vertLumPerChroma is the number of luma lines sharing one
 * chroma line (2 for YV12, 1 for YUV422P); the (y & (vertLumPerChroma-1))
 * test below requires it to be a power of two.
 * NOTE(review): the MMX path processes 16 luma pixels per step, so width is
 * assumed to be a multiple of 16; the scalar paths compose pixel words
 * arithmetically and therefore assume a little-endian target — confirm.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
{
    unsigned y;
    const unsigned chromWidth= width>>1;
    for(y=0; y<height; y++)
    {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%eax, 2) \n\t"
            PREFETCH" 32(%2, %%eax) \n\t"
            PREFETCH" 32(%3, %%eax) \n\t"
            "movq (%2, %%eax), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%eax), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
            "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
            MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
            : "%eax"
        );
#else
#if __WORDSIZE >= 64
        /* 64-bit scalar path: build two YUYV pixels (one qword) at a time. */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }
#else
        /* 32-bit scalar path: one YUYV pixel pair (dword) per iteration. */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i++){
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* Advance chroma only once per vertLumPerChroma luma lines. */
        if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}
  1088. /**
  1089. *
  1090. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1091. * problem for anyone then tell me, and ill fix it)
  1092. */
  1093. static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1094. unsigned int width, unsigned int height,
  1095. unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
  1096. {
  1097. //FIXME interpolate chroma
  1098. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
  1099. }
  1100. /**
  1101. *
  1102. * width should be a multiple of 16
  1103. */
  1104. static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1105. unsigned int width, unsigned int height,
  1106. unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
  1107. {
  1108. RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
  1109. }
  1110. /**
  1111. *
  1112. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1113. * problem for anyone then tell me, and ill fix it)
  1114. */
  1115. static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1116. unsigned int width, unsigned int height,
  1117. unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  1118. {
  1119. unsigned y;
  1120. const unsigned chromWidth= width>>1;
  1121. for(y=0; y<height; y+=2)
  1122. {
  1123. #ifdef HAVE_MMX
  1124. asm volatile(
  1125. "xorl %%eax, %%eax \n\t"
  1126. "pcmpeqw %%mm7, %%mm7 \n\t"
  1127. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1128. ".balign 16 \n\t"
  1129. "1: \n\t"
  1130. PREFETCH" 64(%0, %%eax, 4) \n\t"
  1131. "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
  1132. "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
  1133. "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
  1134. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
  1135. "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
  1136. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
  1137. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1138. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1139. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1140. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1141. MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
  1142. "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
  1143. "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
  1144. "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
  1145. "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
  1146. "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
  1147. "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
  1148. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1149. "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1150. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1151. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1152. MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
  1153. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1154. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1155. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1156. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1157. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1158. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1159. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1160. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1161. MOVNTQ" %%mm0, (%3, %%eax) \n\t"
  1162. MOVNTQ" %%mm2, (%2, %%eax) \n\t"
  1163. "addl $8, %%eax \n\t"
  1164. "cmpl %4, %%eax \n\t"
  1165. " jb 1b \n\t"
  1166. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
  1167. : "memory", "%eax"
  1168. );
  1169. ydst += lumStride;
  1170. src += srcStride;
  1171. asm volatile(
  1172. "xorl %%eax, %%eax \n\t"
  1173. ".balign 16 \n\t"
  1174. "1: \n\t"
  1175. PREFETCH" 64(%0, %%eax, 4) \n\t"
  1176. "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
  1177. "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
  1178. "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
  1179. "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
  1180. "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1181. "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1182. "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1183. "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1184. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1185. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1186. MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
  1187. MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
  1188. "addl $8, %%eax \n\t"
  1189. "cmpl %4, %%eax \n\t"
  1190. " jb 1b \n\t"
  1191. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
  1192. : "memory", "%eax"
  1193. );
  1194. #else
  1195. unsigned i;
  1196. for(i=0; i<chromWidth; i++)
  1197. {
  1198. ydst[2*i+0] = src[4*i+0];
  1199. udst[i] = src[4*i+1];
  1200. ydst[2*i+1] = src[4*i+2];
  1201. vdst[i] = src[4*i+3];
  1202. }
  1203. ydst += lumStride;
  1204. src += srcStride;
  1205. for(i=0; i<chromWidth; i++)
  1206. {
  1207. ydst[2*i+0] = src[4*i+0];
  1208. ydst[2*i+1] = src[4*i+2];
  1209. }
  1210. #endif
  1211. udst += chromStride;
  1212. vdst += chromStride;
  1213. ydst += lumStride;
  1214. src += srcStride;
  1215. }
  1216. #ifdef HAVE_MMX
  1217. asm volatile( EMMS" \n\t"
  1218. SFENCE" \n\t"
  1219. :::"memory");
  1220. #endif
  1221. }
  1222. static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
  1223. uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1224. unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride)
  1225. {
  1226. /* Y Plane */
  1227. memcpy(ydst, ysrc, width*height);
  1228. /* XXX: implement upscaling for U,V */
  1229. }
/*
 * Upscale one 8-bit plane by 2x in both dimensions with simple
 * interpolation.  Each source pixel expands to a 2x2 block; interior
 * output pixels are 3:1 weighted blends of neighbouring source pixels,
 * while the first/last rows and columns are plain replications.
 *
 * NOTE(review): the MMX2/3DNow path processes the whole row in 8-pixel
 * steps starting at mmxSize = srcWidth, so srcWidth is presumably expected
 * to be a multiple of 8 -- confirm against callers.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
	int x,y;
	// first line: plain horizontal duplication, no vertical blend possible
	for(x=0; x<srcWidth; x++){
		dst[2*x+0]=
		dst[2*x+1]= src[x];
	}
	dst+= dstStride;
	for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		const int mmxSize= srcWidth;
		/* %%eax runs from -mmxSize up to 0 (loop exits when sign clears);
		 * source/dest operands are pre-biased by +mmxSize so the effective
		 * addresses walk the row left to right.  The doubled PAVGB pairs
		 * build approximate 3:1 weightings of the two source rows. */
		asm volatile(
			"movl %4, %%eax \n\t"
			"1: \n\t"
			"movq (%0, %%eax), %%mm0 \n\t"
			"movq (%1, %%eax), %%mm1 \n\t"
			"movq 1(%0, %%eax), %%mm2 \n\t"
			"movq 1(%1, %%eax), %%mm3 \n\t"
			"movq %%mm0, %%mm4 \n\t"
			"movq %%mm1, %%mm5 \n\t"
			PAVGB" %%mm3, %%mm0 \n\t"
			PAVGB" %%mm3, %%mm0 \n\t"
			PAVGB" %%mm4, %%mm3 \n\t"
			PAVGB" %%mm4, %%mm3 \n\t"
			PAVGB" %%mm2, %%mm1 \n\t"
			PAVGB" %%mm2, %%mm1 \n\t"
			PAVGB" %%mm5, %%mm2 \n\t"
			PAVGB" %%mm5, %%mm2 \n\t"
			"movq %%mm3, %%mm4 \n\t"
			"movq %%mm2, %%mm5 \n\t"
			"punpcklbw %%mm1, %%mm3 \n\t"
			"punpckhbw %%mm1, %%mm4 \n\t"
			"punpcklbw %%mm0, %%mm2 \n\t"
			"punpckhbw %%mm0, %%mm5 \n\t"
#if 1
			MOVNTQ" %%mm3, (%2, %%eax, 2) \n\t"
			MOVNTQ" %%mm4, 8(%2, %%eax, 2) \n\t"
			MOVNTQ" %%mm2, (%3, %%eax, 2) \n\t"
			MOVNTQ" %%mm5, 8(%3, %%eax, 2) \n\t"
#else
			"movq %%mm3, (%2, %%eax, 2) \n\t"
			"movq %%mm4, 8(%2, %%eax, 2) \n\t"
			"movq %%mm2, (%3, %%eax, 2) \n\t"
			"movq %%mm5, 8(%3, %%eax, 2) \n\t"
#endif
			"addl $8, %%eax \n\t"
			" js 1b \n\t"
			:: "r" (src + mmxSize-1), "r" (src + srcStride + mmxSize-1),
			   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
			   "g" (-mmxSize)
			: "%eax"
		);
		/* leftmost output column: plain replication of the first source pixel */
		dst[0]=
		dst[dstStride]= src[0];
#else
		/* C fallback: leftmost column replicated, then 3:1 blends.
		 * NOTE(review): the pairs mix src[x] with src[x+srcStride+1]
		 * (diagonal) rather than src[x+srcStride] -- looks asymmetric;
		 * confirm this is the intended interpolation pattern. */
		dst[0]=
		dst[dstStride]= src[0];
		for(x=0; x<srcWidth-1; x++){
			dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
			dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
			dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
			dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
		}
#endif
		/* rightmost output column: replication of the last source pixel */
		dst[srcWidth*2 -1]=
		dst[srcWidth*2 -1 + dstStride]= src[srcWidth-1];
		dst+=dstStride*2;
		src+=srcStride;
	}
	src-=srcStride;
	// last line: plain horizontal duplication again
	for(x=0; x<srcWidth; x++){
		dst[2*x+0]=
		dst[2*x+1]= src[x];
	}
#ifdef HAVE_MMX
	/* leave MMX state and flush non-temporal stores */
	asm volatile( EMMS" \n\t"
	              SFENCE" \n\t"
	              :::"memory");
#endif
}
  1312. /**
  1313. *
  1314. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  1315. * problem for anyone then tell me, and ill fix it)
  1316. * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
  1317. */
  1318. static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1319. unsigned int width, unsigned int height,
  1320. unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  1321. {
  1322. unsigned y;
  1323. const unsigned chromWidth= width>>1;
  1324. for(y=0; y<height; y+=2)
  1325. {
  1326. #ifdef HAVE_MMX
  1327. asm volatile(
  1328. "xorl %%eax, %%eax \n\t"
  1329. "pcmpeqw %%mm7, %%mm7 \n\t"
  1330. "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
  1331. ".balign 16 \n\t"
  1332. "1: \n\t"
  1333. PREFETCH" 64(%0, %%eax, 4) \n\t"
  1334. "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
  1335. "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
  1336. "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
  1337. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
  1338. "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
  1339. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
  1340. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
  1341. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
  1342. "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1343. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
  1344. MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
  1345. "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
  1346. "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
  1347. "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
  1348. "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
  1349. "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
  1350. "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
  1351. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
  1352. "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
  1353. "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
  1354. "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
  1355. MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
  1356. "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
  1357. "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
  1358. "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
  1359. "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
  1360. "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
  1361. "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
  1362. "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
  1363. "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
  1364. MOVNTQ" %%mm0, (%3, %%eax) \n\t"
  1365. MOVNTQ" %%mm2, (%2, %%eax) \n\t"
  1366. "addl $8, %%eax \n\t"
  1367. "cmpl %4, %%eax \n\t"
  1368. " jb 1b \n\t"
  1369. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
  1370. : "memory", "%eax"
  1371. );
  1372. ydst += lumStride;
  1373. src += srcStride;
  1374. asm volatile(
  1375. "xorl %%eax, %%eax \n\t"
  1376. ".balign 16 \n\t"
  1377. "1: \n\t"
  1378. PREFETCH" 64(%0, %%eax, 4) \n\t"
  1379. "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
  1380. "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
  1381. "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
  1382. "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
  1383. "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
  1384. "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
  1385. "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
  1386. "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
  1387. "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
  1388. "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
  1389. MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
  1390. MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
  1391. "addl $8, %%eax \n\t"
  1392. "cmpl %4, %%eax \n\t"
  1393. " jb 1b \n\t"
  1394. ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
  1395. : "memory", "%eax"
  1396. );
  1397. #else
  1398. unsigned i;
  1399. for(i=0; i<chromWidth; i++)
  1400. {
  1401. udst[i] = src[4*i+0];
  1402. ydst[2*i+0] = src[4*i+1];
  1403. vdst[i] = src[4*i+2];
  1404. ydst[2*i+1] = src[4*i+3];
  1405. }
  1406. ydst += lumStride;
  1407. src += srcStride;
  1408. for(i=0; i<chromWidth; i++)
  1409. {
  1410. ydst[2*i+0] = src[4*i+1];
  1411. ydst[2*i+1] = src[4*i+3];
  1412. }
  1413. #endif
  1414. udst += chromStride;
  1415. vdst += chromStride;
  1416. ydst += lumStride;
  1417. src += srcStride;
  1418. }
  1419. #ifdef HAVE_MMX
  1420. asm volatile( EMMS" \n\t"
  1421. SFENCE" \n\t"
  1422. :::"memory");
  1423. #endif
  1424. }
  1425. /**
  1426. *
  1427. * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
  1428. * problem for anyone then tell me, and ill fix it)
  1429. * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
  1430. */
  1431. static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1432. unsigned int width, unsigned int height,
  1433. unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  1434. {
  1435. unsigned y;
  1436. const unsigned chromWidth= width>>1;
  1437. #ifdef HAVE_MMX
  1438. for(y=0; y<height-2; y+=2)
  1439. {
  1440. unsigned i;
  1441. for(i=0; i<2; i++)
  1442. {
  1443. asm volatile(
  1444. "movl %2, %%eax \n\t"
  1445. "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
  1446. "movq "MANGLE(w1111)", %%mm5 \n\t"
  1447. "pxor %%mm7, %%mm7 \n\t"
  1448. "leal (%%eax, %%eax, 2), %%ebx \n\t"
  1449. ".balign 16 \n\t"
  1450. "1: \n\t"
  1451. PREFETCH" 64(%0, %%ebx) \n\t"
  1452. "movd (%0, %%ebx), %%mm0 \n\t"
  1453. "movd 3(%0, %%ebx), %%mm1 \n\t"
  1454. "punpcklbw %%mm7, %%mm0 \n\t"
  1455. "punpcklbw %%mm7, %%mm1 \n\t"
  1456. "movd 6(%0, %%ebx), %%mm2 \n\t"
  1457. "movd 9(%0, %%ebx), %%mm3 \n\t"
  1458. "punpcklbw %%mm7, %%mm2 \n\t"
  1459. "punpcklbw %%mm7, %%mm3 \n\t"
  1460. "pmaddwd %%mm6, %%mm0 \n\t"
  1461. "pmaddwd %%mm6, %%mm1 \n\t"
  1462. "pmaddwd %%mm6, %%mm2 \n\t"
  1463. "pmaddwd %%mm6, %%mm3 \n\t"
  1464. #ifndef FAST_BGR2YV12
  1465. "psrad $8, %%mm0 \n\t"
  1466. "psrad $8, %%mm1 \n\t"
  1467. "psrad $8, %%mm2 \n\t"
  1468. "psrad $8, %%mm3 \n\t"
  1469. #endif
  1470. "packssdw %%mm1, %%mm0 \n\t"
  1471. "packssdw %%mm3, %%mm2 \n\t"
  1472. "pmaddwd %%mm5, %%mm0 \n\t"
  1473. "pmaddwd %%mm5, %%mm2 \n\t"
  1474. "packssdw %%mm2, %%mm0 \n\t"
  1475. "psraw $7, %%mm0 \n\t"
  1476. "movd 12(%0, %%ebx), %%mm4 \n\t"
  1477. "movd 15(%0, %%ebx), %%mm1 \n\t"
  1478. "punpcklbw %%mm7, %%mm4 \n\t"
  1479. "punpcklbw %%mm7, %%mm1 \n\t"
  1480. "movd 18(%0, %%ebx), %%mm2 \n\t"
  1481. "movd 21(%0, %%ebx), %%mm3 \n\t"
  1482. "punpcklbw %%mm7, %%mm2 \n\t"
  1483. "punpcklbw %%mm7, %%mm3 \n\t"
  1484. "pmaddwd %%mm6, %%mm4 \n\t"
  1485. "pmaddwd %%mm6, %%mm1 \n\t"
  1486. "pmaddwd %%mm6, %%mm2 \n\t"
  1487. "pmaddwd %%mm6, %%mm3 \n\t"
  1488. #ifndef FAST_BGR2YV12
  1489. "psrad $8, %%mm4 \n\t"
  1490. "psrad $8, %%mm1 \n\t"
  1491. "psrad $8, %%mm2 \n\t"
  1492. "psrad $8, %%mm3 \n\t"
  1493. #endif
  1494. "packssdw %%mm1, %%mm4 \n\t"
  1495. "packssdw %%mm3, %%mm2 \n\t"
  1496. "pmaddwd %%mm5, %%mm4 \n\t"
  1497. "pmaddwd %%mm5, %%mm2 \n\t"
  1498. "addl $24, %%ebx \n\t"
  1499. "packssdw %%mm2, %%mm4 \n\t"
  1500. "psraw $7, %%mm4 \n\t"
  1501. "packuswb %%mm4, %%mm0 \n\t"
  1502. "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
  1503. MOVNTQ" %%mm0, (%1, %%eax) \n\t"
  1504. "addl $8, %%eax \n\t"
  1505. " js 1b \n\t"
  1506. : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
  1507. : "%eax", "%ebx"
  1508. );
  1509. ydst += lumStride;
  1510. src += srcStride;
  1511. }
  1512. src -= srcStride*2;
  1513. asm volatile(
  1514. "movl %4, %%eax \n\t"
  1515. "movq "MANGLE(w1111)", %%mm5 \n\t"
  1516. "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
  1517. "pxor %%mm7, %%mm7 \n\t"
  1518. "leal (%%eax, %%eax, 2), %%ebx \n\t"
  1519. "addl %%ebx, %%ebx \n\t"
  1520. ".balign 16 \n\t"
  1521. "1: \n\t"
  1522. PREFETCH" 64(%0, %%ebx) \n\t"
  1523. PREFETCH" 64(%1, %%ebx) \n\t"
  1524. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  1525. "movq (%0, %%ebx), %%mm0 \n\t"
  1526. "movq (%1, %%ebx), %%mm1 \n\t"
  1527. "movq 6(%0, %%ebx), %%mm2 \n\t"
  1528. "movq 6(%1, %%ebx), %%mm3 \n\t"
  1529. PAVGB" %%mm1, %%mm0 \n\t"
  1530. PAVGB" %%mm3, %%mm2 \n\t"
  1531. "movq %%mm0, %%mm1 \n\t"
  1532. "movq %%mm2, %%mm3 \n\t"
  1533. "psrlq $24, %%mm0 \n\t"
  1534. "psrlq $24, %%mm2 \n\t"
  1535. PAVGB" %%mm1, %%mm0 \n\t"
  1536. PAVGB" %%mm3, %%mm2 \n\t"
  1537. "punpcklbw %%mm7, %%mm0 \n\t"
  1538. "punpcklbw %%mm7, %%mm2 \n\t"
  1539. #else
  1540. "movd (%0, %%ebx), %%mm0 \n\t"
  1541. "movd (%1, %%ebx), %%mm1 \n\t"
  1542. "movd 3(%0, %%ebx), %%mm2 \n\t"
  1543. "movd 3(%1, %%ebx), %%mm3 \n\t"
  1544. "punpcklbw %%mm7, %%mm0 \n\t"
  1545. "punpcklbw %%mm7, %%mm1 \n\t"
  1546. "punpcklbw %%mm7, %%mm2 \n\t"
  1547. "punpcklbw %%mm7, %%mm3 \n\t"
  1548. "paddw %%mm1, %%mm0 \n\t"
  1549. "paddw %%mm3, %%mm2 \n\t"
  1550. "paddw %%mm2, %%mm0 \n\t"
  1551. "movd 6(%0, %%ebx), %%mm4 \n\t"
  1552. "movd 6(%1, %%ebx), %%mm1 \n\t"
  1553. "movd 9(%0, %%ebx), %%mm2 \n\t"
  1554. "movd 9(%1, %%ebx), %%mm3 \n\t"
  1555. "punpcklbw %%mm7, %%mm4 \n\t"
  1556. "punpcklbw %%mm7, %%mm1 \n\t"
  1557. "punpcklbw %%mm7, %%mm2 \n\t"
  1558. "punpcklbw %%mm7, %%mm3 \n\t"
  1559. "paddw %%mm1, %%mm4 \n\t"
  1560. "paddw %%mm3, %%mm2 \n\t"
  1561. "paddw %%mm4, %%mm2 \n\t"
  1562. "psrlw $2, %%mm0 \n\t"
  1563. "psrlw $2, %%mm2 \n\t"
  1564. #endif
  1565. "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
  1566. "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
  1567. "pmaddwd %%mm0, %%mm1 \n\t"
  1568. "pmaddwd %%mm2, %%mm3 \n\t"
  1569. "pmaddwd %%mm6, %%mm0 \n\t"
  1570. "pmaddwd %%mm6, %%mm2 \n\t"
  1571. #ifndef FAST_BGR2YV12
  1572. "psrad $8, %%mm0 \n\t"
  1573. "psrad $8, %%mm1 \n\t"
  1574. "psrad $8, %%mm2 \n\t"
  1575. "psrad $8, %%mm3 \n\t"
  1576. #endif
  1577. "packssdw %%mm2, %%mm0 \n\t"
  1578. "packssdw %%mm3, %%mm1 \n\t"
  1579. "pmaddwd %%mm5, %%mm0 \n\t"
  1580. "pmaddwd %%mm5, %%mm1 \n\t"
  1581. "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
  1582. "psraw $7, %%mm0 \n\t"
  1583. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  1584. "movq 12(%0, %%ebx), %%mm4 \n\t"
  1585. "movq 12(%1, %%ebx), %%mm1 \n\t"
  1586. "movq 18(%0, %%ebx), %%mm2 \n\t"
  1587. "movq 18(%1, %%ebx), %%mm3 \n\t"
  1588. PAVGB" %%mm1, %%mm4 \n\t"
  1589. PAVGB" %%mm3, %%mm2 \n\t"
  1590. "movq %%mm4, %%mm1 \n\t"
  1591. "movq %%mm2, %%mm3 \n\t"
  1592. "psrlq $24, %%mm4 \n\t"
  1593. "psrlq $24, %%mm2 \n\t"
  1594. PAVGB" %%mm1, %%mm4 \n\t"
  1595. PAVGB" %%mm3, %%mm2 \n\t"
  1596. "punpcklbw %%mm7, %%mm4 \n\t"
  1597. "punpcklbw %%mm7, %%mm2 \n\t"
  1598. #else
  1599. "movd 12(%0, %%ebx), %%mm4 \n\t"
  1600. "movd 12(%1, %%ebx), %%mm1 \n\t"
  1601. "movd 15(%0, %%ebx), %%mm2 \n\t"
  1602. "movd 15(%1, %%ebx), %%mm3 \n\t"
  1603. "punpcklbw %%mm7, %%mm4 \n\t"
  1604. "punpcklbw %%mm7, %%mm1 \n\t"
  1605. "punpcklbw %%mm7, %%mm2 \n\t"
  1606. "punpcklbw %%mm7, %%mm3 \n\t"
  1607. "paddw %%mm1, %%mm4 \n\t"
  1608. "paddw %%mm3, %%mm2 \n\t"
  1609. "paddw %%mm2, %%mm4 \n\t"
  1610. "movd 18(%0, %%ebx), %%mm5 \n\t"
  1611. "movd 18(%1, %%ebx), %%mm1 \n\t"
  1612. "movd 21(%0, %%ebx), %%mm2 \n\t"
  1613. "movd 21(%1, %%ebx), %%mm3 \n\t"
  1614. "punpcklbw %%mm7, %%mm5 \n\t"
  1615. "punpcklbw %%mm7, %%mm1 \n\t"
  1616. "punpcklbw %%mm7, %%mm2 \n\t"
  1617. "punpcklbw %%mm7, %%mm3 \n\t"
  1618. "paddw %%mm1, %%mm5 \n\t"
  1619. "paddw %%mm3, %%mm2 \n\t"
  1620. "paddw %%mm5, %%mm2 \n\t"
  1621. "movq "MANGLE(w1111)", %%mm5 \n\t"
  1622. "psrlw $2, %%mm4 \n\t"
  1623. "psrlw $2, %%mm2 \n\t"
  1624. #endif
  1625. "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
  1626. "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
  1627. "pmaddwd %%mm4, %%mm1 \n\t"
  1628. "pmaddwd %%mm2, %%mm3 \n\t"
  1629. "pmaddwd %%mm6, %%mm4 \n\t"
  1630. "pmaddwd %%mm6, %%mm2 \n\t"
  1631. #ifndef FAST_BGR2YV12
  1632. "psrad $8, %%mm4 \n\t"
  1633. "psrad $8, %%mm1 \n\t"
  1634. "psrad $8, %%mm2 \n\t"
  1635. "psrad $8, %%mm3 \n\t"
  1636. #endif
  1637. "packssdw %%mm2, %%mm4 \n\t"
  1638. "packssdw %%mm3, %%mm1 \n\t"
  1639. "pmaddwd %%mm5, %%mm4 \n\t"
  1640. "pmaddwd %%mm5, %%mm1 \n\t"
  1641. "addl $24, %%ebx \n\t"
  1642. "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
  1643. "psraw $7, %%mm4 \n\t"
  1644. "movq %%mm0, %%mm1 \n\t"
  1645. "punpckldq %%mm4, %%mm0 \n\t"
  1646. "punpckhdq %%mm4, %%mm1 \n\t"
  1647. "packsswb %%mm1, %%mm0 \n\t"
  1648. "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
  1649. "movd %%mm0, (%2, %%eax) \n\t"
  1650. "punpckhdq %%mm0, %%mm0 \n\t"
  1651. "movd %%mm0, (%3, %%eax) \n\t"
  1652. "addl $4, %%eax \n\t"
  1653. " js 1b \n\t"
  1654. : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
  1655. : "%eax", "%ebx"
  1656. );
  1657. udst += chromStride;
  1658. vdst += chromStride;
  1659. src += srcStride*2;
  1660. }
  1661. asm volatile( EMMS" \n\t"
  1662. SFENCE" \n\t"
  1663. :::"memory");
  1664. #else
  1665. y=0;
  1666. #endif
  1667. for(; y<height; y+=2)
  1668. {
  1669. unsigned i;
  1670. for(i=0; i<chromWidth; i++)
  1671. {
  1672. unsigned int b= src[6*i+0];
  1673. unsigned int g= src[6*i+1];
  1674. unsigned int r= src[6*i+2];
  1675. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  1676. unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
  1677. unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
  1678. udst[i] = U;
  1679. vdst[i] = V;
  1680. ydst[2*i] = Y;
  1681. b= src[6*i+3];
  1682. g= src[6*i+4];
  1683. r= src[6*i+5];
  1684. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  1685. ydst[2*i+1] = Y;
  1686. }
  1687. ydst += lumStride;
  1688. src += srcStride;
  1689. for(i=0; i<chromWidth; i++)
  1690. {
  1691. unsigned int b= src[6*i+0];
  1692. unsigned int g= src[6*i+1];
  1693. unsigned int r= src[6*i+2];
  1694. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  1695. ydst[2*i] = Y;
  1696. b= src[6*i+3];
  1697. g= src[6*i+4];
  1698. r= src[6*i+5];
  1699. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  1700. ydst[2*i+1] = Y;
  1701. }
  1702. udst += chromStride;
  1703. vdst += chromStride;
  1704. ydst += lumStride;
  1705. src += srcStride;
  1706. }
  1707. }
/*
 * Byte-interleave two planes: dest[2w] = src1[w], dest[2w+1] = src2[w],
 * line by line with independent strides.
 *
 * NOTE(review): the SIMD paths loop while %%eax < width-15 with unsigned
 * width -- for width < 16 this wraps to a huge bound; verify callers.
 * NOTE(review): the SSE2 path uses movdqa, which requires 16-byte-aligned
 * src1/src2 (and eax offsets) -- confirm the callers guarantee this.
 */
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
	unsigned width, unsigned height, unsigned src1Stride,
	unsigned src2Stride, unsigned dstStride){
	unsigned h;
	for(h=0; h < height; h++)
	{
		unsigned w;
#ifdef HAVE_MMX
#ifdef HAVE_SSE2
		/* 16 bytes of each source per iteration; xmm0/xmm1 both hold
		 * src1 data, interleaved low/high with src2 (xmm2) and stored
		 * non-temporally as 32 output bytes. */
		asm(
			"xorl %%eax, %%eax \n\t"
			"1: \n\t"
			PREFETCH" 64(%1, %%eax) \n\t"
			PREFETCH" 64(%2, %%eax) \n\t"
			"movdqa (%1, %%eax), %%xmm0 \n\t"
			"movdqa (%1, %%eax), %%xmm1 \n\t"
			"movdqa (%2, %%eax), %%xmm2 \n\t"
			"punpcklbw %%xmm2, %%xmm0 \n\t"
			"punpckhbw %%xmm2, %%xmm1 \n\t"
			"movntdq %%xmm0, (%0, %%eax, 2) \n\t"
			"movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
			"addl $16, %%eax \n\t"
			"cmpl %3, %%eax \n\t"
			" jb 1b \n\t"
			::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
			: "memory", "%eax"
		);
#else
		/* MMX variant: same interleave, 16 source bytes per iteration
		 * done in two 8-byte halves. */
		asm(
			"xorl %%eax, %%eax \n\t"
			"1: \n\t"
			PREFETCH" 64(%1, %%eax) \n\t"
			PREFETCH" 64(%2, %%eax) \n\t"
			"movq (%1, %%eax), %%mm0 \n\t"
			"movq 8(%1, %%eax), %%mm2 \n\t"
			"movq %%mm0, %%mm1 \n\t"
			"movq %%mm2, %%mm3 \n\t"
			"movq (%2, %%eax), %%mm4 \n\t"
			"movq 8(%2, %%eax), %%mm5 \n\t"
			"punpcklbw %%mm4, %%mm0 \n\t"
			"punpckhbw %%mm4, %%mm1 \n\t"
			"punpcklbw %%mm5, %%mm2 \n\t"
			"punpckhbw %%mm5, %%mm3 \n\t"
			MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
			MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
			MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
			MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
			"addl $16, %%eax \n\t"
			"cmpl %3, %%eax \n\t"
			" jb 1b \n\t"
			::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
			: "memory", "%eax"
		);
#endif
		/* scalar tail: remaining width & 15 bytes */
		for(w= (width&(~15)); w < width; w++)
		{
			dest[2*w+0] = src1[w];
			dest[2*w+1] = src2[w];
		}
#else
		/* plain C path */
		for(w=0; w < width; w++)
		{
			dest[2*w+0] = src1[w];
			dest[2*w+1] = src2[w];
		}
#endif
		dest += dstStride;
		src1 += src1Stride;
		src2 += src2Stride;
	}
#ifdef HAVE_MMX
	/* leave MMX state and flush non-temporal stores */
	asm(
		EMMS" \n\t"
		SFENCE" \n\t"
		::: "memory"
	);
#endif
}