/*
 *
 * rgb2rgb.c, Software RGB to RGB converter
 *            plus Software PAL8 to RGB converter
 *            Software YUV to YUV converter
 *            Software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV stuff by Michael
 */
#include <inttypes.h>
#include "../config.h"
#include "rgb2rgb.h"
#include "../mmx_defs.h"
#ifdef HAVE_MMX
static const uint64_t mask32b __attribute__((aligned(8))) = 0x000000FF000000FFULL;
static const uint64_t mask32g __attribute__((aligned(8))) = 0x0000FF000000FF00ULL;
static const uint64_t mask32r __attribute__((aligned(8))) = 0x00FF000000FF0000ULL;
static const uint64_t mask32 __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;
static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
static const uint64_t mask24hh __attribute__((aligned(8))) = 0xffff000000000000ULL;
static const uint64_t mask24hhh __attribute__((aligned(8))) = 0xffffffff00000000ULL;
static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL;
static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;
static const uint64_t red_16mask __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
static const uint64_t green_16mask __attribute__((aligned(8))) = 0x000007e0000007e0ULL;
static const uint64_t blue_16mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
static const uint64_t red_15mask __attribute__((aligned(8))) = 0x00007c0000007c00ULL;
static const uint64_t green_15mask __attribute__((aligned(8))) = 0x000003e0000003e0ULL;
static const uint64_t blue_15mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
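/* The *_16mask and *_15mask constants above each hold the mask for two pixels,
 * one per 32-bit half.  The 16-bit masks select the RGB565 fields of a pixel
 * kept in the low word of each dword (r: bits 15..11, g: 10..5, b: 4..0);
 * the 15-bit masks select the RGB555 fields (r: 14..10, g: 9..5, b: 4..0). */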
#if 0
static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;
static uint64_t __attribute__((aligned(8))) dither4[2]={
    0x0103010301030103LL,
    0x0200020002000200LL,};
static uint64_t __attribute__((aligned(8))) dither8[2]={
    0x0602060206020602LL,
    0x0004000400040004LL,};
#endif
#endif
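/* Expand packed 24-bit pixels to 32 bits; the 4th byte of each output pixel
 * is zero-filled (the MMX path clears it via mask32, the C path stores 0). */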
void rgb24to32(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4));
    __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
    if(mm_end == end) mm_end -= MMREG_SIZE*4;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "punpckldq 3%1, %%mm0\n\t"
            "movd 6%1, %%mm1\n\t"
            "punpckldq 9%1, %%mm1\n\t"
            "movd 12%1, %%mm2\n\t"
            "punpckldq 15%1, %%mm2\n\t"
            "movd 18%1, %%mm3\n\t"
            "punpckldq 21%1, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm1\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 0;
    }
}
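/* Drop the 4th byte of each 32-bit pixel, packing the remaining three bytes
 * tightly into 24-bit pixels. */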
void rgb32to24(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"
            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}
/*
 Original by Strepto/Astral
 ported to gcc & bugfixed: A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32-bit C version and the and&add trick by Michael Niedermayer
*/
void rgb15to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    register const char* s=src+src_size;
    register char* d=dst+src_size;
    register int offs=-src_size;
    __asm __volatile(PREFETCH" %0"::"m"(*(s+offs)));
    __asm __volatile(
        "movq %0, %%mm4\n\t"
        ::"m"(mask15s));
    while(offs<0)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "pand %%mm4, %%mm0\n\t"
            "pand %%mm4, %%mm2\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*(d+offs))
            :"m"(*(s+offs))
            );
        offs+=16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
#if 0
    const uint16_t *s1=( const uint16_t * )src;
    uint16_t *d1=( uint16_t * )dst;
    const uint16_t *e=( const uint16_t * )( src + src_size );
    while( s1<e ){
        register int x=*( s1++ );
        /* rrrrrggggggbbbbb
           0rrrrrgggggbbbbb
           0111 1111 1110 0000 = 0x7FE0
           0000 0000 0001 1111 = 0x001F */
        *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
    }
#else
    const unsigned *s1=( unsigned * )src;
    unsigned *d1=( unsigned * )dst;
    int i;
    int size= src_size>>2;
    for(i=0; i<size; i++)
    {
        register int x= s1[i];
        // d1[i] = x + (x&0x7FE07FE0); // faster, but needs the MSB of each pixel to be 0, which might not always be true
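        // and&add trick: (x&0x7FFF7FFF) + (x&0x7FE07FE0) doubles the R and G
        // fields (bits 14..5 of each 16-bit pixel), shifting them up by one
        // position while the 5 blue bits stay put; the freed bit 5 becomes the
        // zero LSB of the 6-bit green field.  Masking with 0x7FFF7FFF first
        // keeps a carry from the low pixel from spilling into the high one.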
        d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
    }
#endif
#endif
}
/**
 * Palette is assumed to contain bgr32
 */
void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    unsigned i;
    for(i=0; i<num_pixels; i++)
        ((unsigned *)dst)[i] = ((unsigned *)palette)[ src[i] ];
}
/**
 * Palette is assumed to contain bgr32
 */
void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    unsigned i;
    /*
    writes 1 byte too much and might cause alignment issues on some architectures?
    for(i=0; i<num_pixels; i++)
        *((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[ src[i] ];
    */
    for(i=0; i<num_pixels; i++)
    {
        //FIXME slow?
        dst[0]= palette[ src[i]*4+0 ];
        dst[1]= palette[ src[i]*4+1 ];
        dst[2]= palette[ src[i]*4+2 ];
        dst+= 3;
    }
}
void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end,*mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    while(s < end)
    {
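        // scalar tail: pack one BGRX pixel into RGB565 (b -> bits 4..0, g -> 10..5, r -> 15..11)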
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        s++; // skip the 4th byte of the 32-bit source pixel
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/4;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=4,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
#endif
}
void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end,*mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    while(s < end)
    {
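        // scalar tail: pack one BGRX pixel into RGB555 (b -> bits 4..0, g -> 9..5, r -> 14..10)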
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        s++; // skip the 4th byte of the 32-bit source pixel
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/4;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=4,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
#endif
}
void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end,*mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    if(mm_end == end) mm_end -= MMREG_SIZE*2;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/3;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=3,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
#endif
}
void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end,*mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    if(mm_end == end) mm_end -= MMREG_SIZE*2;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/3;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=3,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
#endif
}
/**
 * Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
 */
void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    unsigned i;
    for(i=0; i<num_pixels; i++)
        ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
}
/**
 * Palette is assumed to contain bgr15, see rgb32to15 to convert the palette
 */
void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    unsigned i;
    for(i=0; i<num_pixels; i++)
        ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
}
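/* Swap the R and B channels of each 32-bit pixel; the MMX path zeroes the
 * 4th byte of every output pixel, the C path leaves it untouched. */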
void rgb32tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    int num_pixels= src_size >> 2;
#ifdef HAVE_MMX
    asm volatile (
        "xorl %%eax, %%eax              \n\t"
        ".balign 16                     \n\t"
        "1:                             \n\t"
        PREFETCH" 32(%0, %%eax)         \n\t"
        "movq (%0, %%eax), %%mm0        \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm0, %%mm2              \n\t"
        "pslld $16, %%mm0               \n\t"
        "psrld $16, %%mm1               \n\t"
        "pand mask32r, %%mm0            \n\t"
        "pand mask32g, %%mm2            \n\t"
        "pand mask32b, %%mm1            \n\t"
        "por %%mm0, %%mm2               \n\t"
        "por %%mm1, %%mm2               \n\t"
        MOVNTQ" %%mm2, (%1, %%eax)      \n\t"
        "addl $8, %%eax                 \n\t"
        "cmpl %2, %%eax                 \n\t"
        " jb 1b                         \n\t"
        :: "r" (src), "r"(dst), "r" (num_pixels<<2)
        : "%eax"
    );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    int i;
    for(i=0; i<num_pixels; i++)
    {
        dst[4*i + 0] = src[4*i + 2];
        dst[4*i + 1] = src[4*i + 1];
        dst[4*i + 2] = src[4*i + 0];
    }
#endif
}
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
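 * Output is packed YUY2: each pair of pixels is stored as the bytes Y0 U0 Y1 V0.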
 */
void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                unsigned int width, unsigned int height,
                unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
    int y;
    const int chromWidth= width>>1;
    for(y=0; y<height; y++)
    {
#ifdef HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
        asm volatile(
            "xorl %%eax, %%eax              \n\t"
            ".balign 16                     \n\t"
            "1:                             \n\t"
            PREFETCH" 32(%1, %%eax, 2)      \n\t"
            PREFETCH" 32(%2, %%eax)         \n\t"
            PREFETCH" 32(%3, %%eax)         \n\t"
            "movq (%2, %%eax), %%mm0        \n\t" // U(0)
            "movq %%mm0, %%mm2              \n\t" // U(0)
            "movq (%3, %%eax), %%mm1        \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
            "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
            "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
            "movq %%mm3, %%mm4              \n\t" // Y(0)
            "movq %%mm5, %%mm6              \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
            MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
            MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
            MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
            "addl $8, %%eax                 \n\t"
            "cmpl %4, %%eax                 \n\t"
            " jb 1b                         \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
            : "%eax"
        );
#else
        int i;
        for(i=0; i<chromWidth; i++)
        {
            dst[4*i+0] = ysrc[2*i+0];
            dst[4*i+1] = usrc[i];
            dst[4*i+2] = ysrc[2*i+1];
            dst[4*i+3] = vsrc[i];
        }
#endif
        if(y&1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
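 * Input is packed YUYV; chroma is taken from the even lines only (no averaging).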
 */
void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                unsigned int width, unsigned int height,
                unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    int y;
    const int chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax              \n\t"
            "pcmpeqw %%mm7, %%mm7           \n\t"
            "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
            ".balign 16                     \n\t"
            "1:                             \n\t"
            PREFETCH" 64(%0, %%eax, 4)      \n\t"
            "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
            MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
            "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
            "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
            MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
            MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
            "addl $8, %%eax                 \n\t"
            "cmpl %4, %%eax                 \n\t"
            " jb 1b                         \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
        ydst += lumStride;
        src += srcStride;
        asm volatile(
            "xorl %%eax, %%eax              \n\t"
            ".balign 16                     \n\t"
            "1:                             \n\t"
            PREFETCH" 64(%0, %%eax, 4)      \n\t"
            "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
            "addl $8, %%eax                 \n\t"
            "cmpl %4, %%eax                 \n\t"
            " jb 1b                         \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        int i;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            udst[i] = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i] = src[4*i+3];
        }
        ydst += lumStride;
        src += srcStride;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
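 * Input is packed UYVY (U0 Y0 V0 Y1); as above, chroma is taken from the even lines only.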
 */
void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                unsigned int width, unsigned int height,
                unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    int y;
    const int chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax              \n\t"
            "pcmpeqw %%mm7, %%mm7           \n\t"
            "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
            ".balign 16                     \n\t"
            "1:                             \n\t"
            PREFETCH" 64(%0, %%eax, 4)      \n\t"
            "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
            MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
            "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
            "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
            MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
            MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
            "addl $8, %%eax                 \n\t"
            "cmpl %4, %%eax                 \n\t"
            " jb 1b                         \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
        ydst += lumStride;
        src += srcStride;
        asm volatile(
            "xorl %%eax, %%eax              \n\t"
            ".balign 16                     \n\t"
            "1:                             \n\t"
            PREFETCH" 64(%0, %%eax, 4)      \n\t"
            "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
            "movq 16(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm3   \n\t" // UYVY UYVY(12)
            "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
            "addl $8, %%eax                 \n\t"
            "cmpl %4, %%eax                 \n\t"
            " jb 1b                         \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        int i;
        for(i=0; i<chromWidth; i++)
        {
            udst[i] = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i] = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src += srcStride;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}