You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2419 lines
60KB

  1. /*
  2. *
  3. * rgb2rgb.c, Software RGB to RGB convertor
  4. * pluralize by Software PAL8 to RGB convertor
  5. * Software YUV to YUV convertor
  6. * Software YUV to RGB convertor
  7. * Written by Nick Kurshev.
  8. * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
  9. */
  10. #include <stddef.h>
  11. #include <inttypes.h> /* for __WORDSIZE */
  12. #ifndef __WORDSIZE
  13. // #warning You have misconfigured system and probably will lose performance!
  14. #define __WORDSIZE MP_WORDSIZE
  15. #endif
  16. #undef PREFETCH
  17. #undef MOVNTQ
  18. #undef EMMS
  19. #undef SFENCE
  20. #undef MMREG_SIZE
  21. #undef PREFETCHW
  22. #undef PAVGB
  23. #ifdef HAVE_SSE2
  24. #define MMREG_SIZE 16
  25. #else
  26. #define MMREG_SIZE 8
  27. #endif
  28. #ifdef HAVE_3DNOW
  29. #define PREFETCH "prefetch"
  30. #define PREFETCHW "prefetchw"
  31. #define PAVGB "pavgusb"
  32. #elif defined ( HAVE_MMX2 )
  33. #define PREFETCH "prefetchnta"
  34. #define PREFETCHW "prefetcht0"
  35. #define PAVGB "pavgb"
  36. #else
  37. #define PREFETCH "/nop"
  38. #define PREFETCHW "/nop"
  39. #endif
  40. #ifdef HAVE_3DNOW
  41. /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
  42. #define EMMS "femms"
  43. #else
  44. #define EMMS "emms"
  45. #endif
  46. #ifdef HAVE_MMX2
  47. #define MOVNTQ "movntq"
  48. #define SFENCE "sfence"
  49. #else
  50. #define MOVNTQ "movq"
  51. #define SFENCE "/nop"
  52. #endif
/*
 * Expand packed 24bpp pixels to 32bpp by appending a zero byte to each
 * 3-byte pixel (byte order preserved, 4th byte written as 0).
 * src_size is the source length in bytes; dst needs src_size/3*4 bytes.
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* Each MMX iteration consumes 24 source bytes (8 pixels) and writes 32.
     * NOTE(review): "punpckldq 21%1" loads 4 bytes at offset 21, touching
     * byte 24 while only 24 bytes (0..23) are guaranteed — looks like a
     * 1-byte overread on the final iteration; confirm against callers'
     * buffer padding. */
    mm_end = end - 23;
    /* mm7 = mask32: clears the 4th byte of every dword after the merge. */
    __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
    while(s < mm_end)
    {
        /* Gather eight 3-byte pixels as dword pairs, mask the stray byte
         * that punpckldq pulled in, and stream 32 bytes out. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "punpckldq 3%1, %%mm0\n\t"
            "movd 6%1, %%mm1\n\t"
            "punpckldq 9%1, %%mm1\n\t"
            "movd 12%1, %%mm2\n\t"
            "punpckldq 15%1, %%mm2\n\t"
            "movd 18%1, %%mm3\n\t"
            "punpckldq 21%1, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm1\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: copy 3 bytes, pad with 0. Also the whole loop when MMX
     * is compiled out. */
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 0;
    }
}
/*
 * Pack 32bpp pixels down to 24bpp by dropping the 4th byte of each pixel.
 * src_size is the source length in bytes; dst needs src_size/4*3 bytes.
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* Each MMX iteration reads 32 bytes (8 pixels) and writes 24. */
    mm_end = end - 31;
    while(s < mm_end)
    {
        /* Compact each dword to 3 bytes using the mask24* constants, then
         * shift/merge the four quadwords into three output quadwords. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"
            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: keep 3 bytes, skip the 4th. */
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}
  177. /*
  178. Original by Strepto/Astral
  179. ported to gcc & bugfixed : A'rpi
  180. MMX2, 3DNOW optimization by Nick Kurshev
  181. 32bit c version, and and&add trick by Michael Niedermayer
  182. */
/*
 * Convert 15bpp (x1-5-5-5) to 16bpp (5-6-5) in place of layout: the red and
 * green fields move up one bit, blue stays; green's new low bit is 0.
 * Uses the and&add trick: (x&0x7FFF) + (x&0x7FE0) shifts the upper two
 * fields left by one without disturbing blue.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    /* mm4 = mask15s selects the fields to be doubled (shifted up by 1). */
    __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while(s<mm_end)
    {
        /* Same and&add trick on 16 bytes (8 pixels) per iteration. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "pand %%mm4, %%mm0\n\t"
            "pand %%mm4, %%mm2\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* 32-bit scalar version: two pixels at a time. */
    mm_end = end - 3;
    while(s < mm_end)
    {
        register unsigned x= *((uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* At most one 16-bit pixel can remain. */
    if(s < end)
    {
        register unsigned short x= *((uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
  231. static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
  232. {
  233. unsigned j,i,num_pixels=src_size/3;
  234. for(i=0,j=0; j<num_pixels; i+=3,j+=3)
  235. {
  236. dst[j+0] = src[i+2];
  237. dst[j+1] = src[i+1];
  238. dst[j+2] = src[i+0];
  239. }
  240. }
/*
 * Convert 16bpp (5-6-5) to 15bpp (x1-5-5-5): red and green move down one
 * bit (green loses its least significant bit), blue is kept as-is.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    /* mm7 = mask15rg (red+green after >>1), mm6 = mask15b (blue kept). */
    __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while(s<mm_end)
    {
        /* (x>>1)&mask15rg | x&mask15b, 8 pixels per iteration. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "psrlq $1, %%mm0\n\t"
            "psrlq $1, %%mm2\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm3\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* 32-bit scalar version: two pixels at a time. */
    mm_end = end - 3;
    while(s < mm_end)
    {
        register uint32_t x= *((uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* At most one 16-bit pixel can remain. */
    if(s < end)
    {
        register uint16_t x= *((uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s+=2;
        d+=2;
    }
}
/*
 * Convert 32bpp to 16bpp 5-6-5: per pixel, byte0>>3 goes to bits 0-4,
 * byte1's top 6 bits to bits 5-10, byte2's top 5 bits to bits 11-15
 * (see the scalar tail for the exact formula; byte 3 is skipped).
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for the whole loop. */
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /* 16 source bytes (4 pixels) -> 8 output bytes per iteration. */
    mm_end = end - 15;
    while(s < mm_end)
    {
        /* Shift each copy so one channel lands in its 5/6/5 slot, mask,
         * then OR the three fields together and pack 4 pixels per quad. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail; byte order of the 32bpp input depends on endianness. */
    while(s < end)
    {
#ifndef WORDS_BIGENDIAN
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
#else
        const int a= *s++; /*skip*/
        const int r= *s++;
        const int g= *s++;
        const int b= *s++;
#endif
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
#ifndef WORDS_BIGENDIAN
        s++;
#endif
    }
}
/*
 * Convert 32bpp to 16bpp 5-6-5 with the outer channels swapped relative to
 * rgb32to16: the first byte of each pixel ends up in the high (red) field
 * and the third byte in the low (blue) field.
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for the whole loop. */
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /* 16 source bytes (4 pixels) -> 8 output bytes per iteration. */
    mm_end = end - 15;
    while(s < mm_end)
    {
        /* As in rgb32to16 but byte0 is shifted UP into the red slot and
         * byte2 DOWN into the blue slot (psllq $8 / psrlq $19). */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $8, %%mm0\n\t"
            "psllq $8, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: first byte -> high field, third byte -> low field. */
    while(s < end)
    {
        const int r= *s++;
        const int g= *s++;
        const int b= *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        s++;
    }
}
/*
 * Convert 32bpp to 15bpp x1-5-5-5 (see the scalar tail for the exact
 * packing: byte0>>3, byte1's top 5 bits <<2, byte2's top 5 bits <<7).
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the 15bpp red/green field masks for the whole loop. */
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* 16 source bytes (4 pixels) -> 8 output bytes per iteration. */
    mm_end = end - 15;
    while(s < mm_end)
    {
        /* Shift copies into the 5-5-5 slots (>>3, >>6, >>9), mask and
         * merge, then pack the two dword pairs into one quadword. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: pack 3 channel bytes to 5-5-5, skip the 4th byte. */
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        s++;
    }
}
/*
 * Convert 32bpp to 15bpp x1-5-5-5 with the outer channels swapped relative
 * to rgb32to15: the first byte of each pixel lands in the high field and
 * the third byte in the low field.
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the 15bpp red/green field masks for the whole loop. */
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* 16 source bytes (4 pixels) -> 8 output bytes per iteration. */
    mm_end = end - 15;
    while(s < mm_end)
    {
        /* As in rgb32to15 but byte0 is shifted UP (psllq $7) into the high
         * field and byte2 DOWN (psrlq $19) into the low field. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $7, %%mm0\n\t"
            "psllq $7, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: first byte -> high field, third byte -> low field. */
    while(s < end)
    {
        const int r= *s++;
        const int g= *s++;
        const int b= *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        s++;
    }
}
/*
 * Convert packed 24bpp to 16bpp 5-6-5 (same packing as rgb32to16 but the
 * source has no 4th byte to skip).
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for the whole loop. */
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /* 12 source bytes (4 pixels) -> 8 output bytes per iteration.
     * NOTE(review): "punpckldq 9%1" loads 4 bytes at offset 9, touching
     * byte 12 while only 12 bytes (0..11) are guaranteed by
     * mm_end = end - 11 — looks like a 1-byte overread on the last
     * iteration; confirm callers over-allocate. */
    mm_end = end - 11;
    while(s < mm_end)
    {
        /* Gather four 3-byte pixels into dword lanes, then shift/mask the
         * channels into 5-6-5 and pack 4 pixels into one quadword. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: pack 3 channel bytes to 5-6-5. */
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
/*
 * Convert packed 24bpp to 16bpp 5-6-5 with the outer channels swapped
 * relative to rgb24to16: first source byte -> high field, third -> low.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for the whole loop. */
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /* 12 source bytes (4 pixels) -> 8 output bytes; the end - 15 bound is
     * more conservative than rgb24to16's end - 11 (extra slack for the
     * punpckldq 9%1 read past the 12 consumed bytes). */
    mm_end = end - 15;
    while(s < mm_end)
    {
        /* As in rgb24to16 but byte0 goes UP (psllq $8) into the high field
         * and byte2 DOWN (psrlq $19) into the low field. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $8, %%mm0\n\t"
            "psllq $8, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: first byte -> high field, third byte -> low field. */
    while(s < end)
    {
        const int r= *s++;
        const int g= *s++;
        const int b= *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
/*
 * Convert packed 24bpp to 15bpp x1-5-5-5 (same packing as rgb32to15 but
 * the source has no 4th byte to skip).
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the 15bpp red/green field masks for the whole loop. */
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* 12 source bytes (4 pixels) -> 8 output bytes per iteration.
     * NOTE(review): as in rgb24to16, "punpckldq 9%1" can read one byte
     * past the 12 guaranteed by mm_end = end - 11 — confirm buffer
     * padding at the call sites. */
    mm_end = end - 11;
    while(s < mm_end)
    {
        /* Gather four 3-byte pixels, shift/mask the channels into the
         * 5-5-5 slots (>>3, >>6, >>9) and pack 4 pixels per quadword. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: pack 3 channel bytes to 5-5-5. */
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
/*
 * Convert packed 24bpp to 15bpp x1-5-5-5 with the outer channels swapped
 * relative to rgb24to15: first source byte -> high field, third -> low.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the 15bpp red/green field masks for the whole loop. */
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* 12 source bytes (4 pixels) -> 8 output bytes; end - 15 leaves slack
     * for the punpckldq 9%1 read past the 12 consumed bytes. */
    mm_end = end - 15;
    while(s < mm_end)
    {
        /* As in rgb24to15 but byte0 goes UP (psllq $7) into the high field
         * and byte2 DOWN (psrlq $19) into the low field. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $7, %%mm0\n\t"
            "psllq $7, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: first byte -> high field, third byte -> low field. */
    while(s < end)
    {
        const int r= *s++;
        const int g= *s++;
        const int b= *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
/*
 Here we use a less accurate approximation: the input value is simply
 left-shifted and the low-order bits are filled with zeroes. This method
 improves PNG compression, but it cannot reproduce white exactly, since it
 never generates an all-ones maximum value; the net effect is to darken the
 image slightly.

 The better method would be "left bit replication":
  814. 4 3 2 1 0
  815. ---------
  816. 1 1 0 1 1
  817. 7 6 5 4 3 2 1 0
  818. ----------------
  819. 1 1 0 1 1 1 1 0
  820. |=======| |===|
  821. | Leftmost Bits Repeated to Fill Open Bits
  822. |
  823. Original Bits
  824. */
/*
 * Expand 15bpp x1-5-5-5 to packed 24bpp: each 5-bit field is left-shifted
 * to 8 bits with zero fill (see the comment above about the accuracy of
 * this approximation).
 * src_size is the source length in bytes (src_size/2 input pixels).
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* 8 input pixels (16 bytes) -> 24 output bytes per iteration. */
    mm_end = end - 7;
    while(s < mm_end)
    {
        /* Pass 1: isolate the three 5-bit fields with mask15b/g/r, scale
         * them to byte position, and widen to one byte-per-channel dwords
         * (zero-extended via punpck*wd with mmx_null). The first 4 pixels
         * end up parked in mm6/mm7, the next 4 stay in mm0/mm3 — the
         * second asm below relies on exactly this register state. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"
            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* Borrowed 32 to 24 */
        /* Pass 2: same 32->24 packing as rgb32to24, applied to the four
         * quadwords left in mm0/mm3/mm6/mm7 by the asm above. */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"
            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: expand each 5-bit field to a byte by left shift. */
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
/*
 * Expand 16bpp 5-6-5 to packed 24bpp: the 5/6-bit fields are left-shifted
 * to 8 bits with zero fill (see the accuracy note above rgb15to24).
 * src_size is the source length in bytes (src_size/2 input pixels).
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* 8 input pixels (16 bytes) -> 24 output bytes per iteration. */
    mm_end = end - 7;
    while(s < mm_end)
    {
        /* Pass 1: isolate the 5-6-5 fields with mask16b/g/r, scale to byte
         * position ($3 / -$3 / -$8 differ from the 15bpp variant), widen
         * to one byte per channel and interleave. First 4 pixels parked in
         * mm6/mm7, next 4 in mm0/mm3 for the second asm below. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"
            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* Borrowed 32 to 24 */
        /* Pass 2: same 32->24 packing as rgb32to24, applied to the four
         * quadwords left in mm0/mm3/mm6/mm7 by the asm above. */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"
            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: expand each 5/6-bit field to a byte by left shift. */
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
/*
 * Expand 15bpp x1-5-5-5 to 32bpp: each 5-bit field is left-shifted to
 * 8 bits with zero fill, and the 4th output byte is written as 0.
 * src_size is the source length in bytes (src_size/2 input pixels).
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    /* mm7 = 0, used by punpck*wd to zero-extend words to dwords (this
     * also provides the zero 4th byte of each output pixel). */
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
    /* 4 input pixels (8 bytes) -> 16 output bytes per iteration. */
    mm_end = end - 3;
    while(s < mm_end)
    {
        /* Isolate the three fields, scale to byte position, widen with
         * zeros and interleave into two output quadwords. */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar tail: expand each field to a byte, pad with 0. */
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 0;
    }
}
/**
 * Convert 16bpp RGB565 pixels to 32bpp BGRA.
 * src_size is in bytes; the alpha byte of each output pixel is set to 0.
 * Identical in structure to rgb15to32, only the masks/shifts differ
 * (green has 6 bits in 565).
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH"	%0"::"m"(*s):"memory");
    __asm __volatile("pxor	%%mm7,%%mm7\n\t":::"memory"); /* mm7 = 0 for zero-extension */
    mm_end = end - 3; /* 4 pixels per MMX iteration */
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq	%1, %%mm0\n\t"
            "movq	%1, %%mm1\n\t"
            "movq	%1, %%mm2\n\t"
            "pand	%2, %%mm0\n\t"           /* blue  (mask16b) */
            "pand	%3, %%mm1\n\t"           /* green (mask16g) */
            "pand	%4, %%mm2\n\t"           /* red   (mask16r) */
            "psllq	$3, %%mm0\n\t"           /* 5-bit blue  -> bits 7..3 */
            "psrlq	$3, %%mm1\n\t"           /* 6-bit green -> bits 7..2 */
            "psrlq	$8, %%mm2\n\t"           /* 5-bit red   -> bits 7..3 */
            "movq	%%mm0, %%mm3\n\t"
            "movq	%%mm1, %%mm4\n\t"
            "movq	%%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            /* merge B | G<<8 | R<<16, alpha stays 0 */
            "psllq	$8, %%mm1\n\t"
            "psllq	$16, %%mm2\n\t"
            "por	%%mm1, %%mm0\n\t"
            "por	%%mm2, %%mm0\n\t"
            "psllq	$8, %%mm4\n\t"
            "psllq	$16, %%mm5\n\t"
            "por	%%mm4, %%mm3\n\t"
            "por	%%mm5, %%mm3\n\t"
            MOVNTQ"	%%mm0, %0\n\t"
            MOVNTQ"	%%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* scalar tail / non-MMX path */
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;    /* blue */
        *d++ = (bgr&0x7E0)>>3;   /* green */
        *d++ = (bgr&0xF800)>>8;  /* red */
        *d++ = 0;                /* alpha */
    }
}
/**
 * Swap R and B channels of 32bpp pixels (RGBA <-> BGRA).
 * src_size is in bytes.
 * NOTE(review): the C fallback never writes dst[4*i+3]; the MMX path
 * presumably stores 0 there since the three masks select only R/G/B —
 * the two paths therefore differ in the alpha byte; verify mask values
 * and whether any caller relies on alpha.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
#ifdef HAVE_MMX
/* TODO: unroll this loop */
    asm volatile (
        "xorl %%eax, %%eax		\n\t"
        ".balign 16			\n\t"
        "1:				\n\t"
        PREFETCH" 32(%0, %%eax)		\n\t"
        "movq (%0, %%eax), %%mm0	\n\t"
        "movq %%mm0, %%mm1		\n\t"
        "movq %%mm0, %%mm2		\n\t"
        "pslld $16, %%mm0		\n\t"  /* move B up to R's position */
        "psrld $16, %%mm1		\n\t"  /* move R down to B's position */
        "pand "MANGLE(mask32r)", %%mm0	\n\t"
        "pand "MANGLE(mask32g)", %%mm2	\n\t"  /* green (and whatever mask32g keeps) unchanged */
        "pand "MANGLE(mask32b)", %%mm1	\n\t"
        "por %%mm0, %%mm2		\n\t"
        "por %%mm1, %%mm2		\n\t"
        MOVNTQ" %%mm2, (%1, %%eax)	\n\t"
        "addl $8, %%eax			\n\t"
        "cmpl %2, %%eax			\n\t"
        " jb 1b				\n\t"
        :: "r" (src), "r"(dst), "r" (src_size-7)
        : "%eax"
    );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned i;
    unsigned num_pixels = src_size >> 2;
    for(i=0; i<num_pixels; i++)
    {
        dst[4*i + 0] = src[4*i + 2];
        dst[4*i + 1] = src[4*i + 1];
        dst[4*i + 2] = src[4*i + 0];
    }
#endif
}
/**
 * Swap R and B channels of packed 24bpp pixels in place order (RGB <-> BGR).
 * src_size is in bytes (a multiple of 3).
 * MMX path: mmx_size starts at 23 - src_size (negative); the loop counter
 * register runs up toward 0 ("js 1b"), and src/dst are biased by -mmx_size so
 * indexing stays valid. Whatever the 24-byte loop could not handle is redone
 * by the scalar loop on the last few bytes.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i;
#ifdef HAVE_MMX
    int mmx_size= 23 - src_size;
    asm volatile (
        "movq "MANGLE(mask24r)", %%mm5	\n\t"
        "movq "MANGLE(mask24g)", %%mm6	\n\t"
        "movq "MANGLE(mask24b)", %%mm7	\n\t"
        ".balign 16			\n\t"
        "1:				\n\t"
        PREFETCH" 32(%1, %%eax)		\n\t"
        "movq (%1, %%eax), %%mm0	\n\t" // BGR BGR BG
        "movq (%1, %%eax), %%mm1	\n\t" // BGR BGR BG
        "movq 2(%1, %%eax), %%mm2	\n\t" // R BGR BGR B
        "psllq $16, %%mm0		\n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0		\n\t"
        "pand %%mm6, %%mm1		\n\t"
        "pand %%mm7, %%mm2		\n\t"
        "por %%mm0, %%mm1		\n\t"
        "por %%mm2, %%mm1		\n\t"
        "movq 6(%1, %%eax), %%mm0	\n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%eax)	\n\t" // RGB RGB RG
        "movq 8(%1, %%eax), %%mm1	\n\t" // R BGR BGR B
        "movq 10(%1, %%eax), %%mm2	\n\t" // GR BGR BGR
        "pand %%mm7, %%mm0		\n\t"
        "pand %%mm5, %%mm1		\n\t"
        "pand %%mm6, %%mm2		\n\t"
        "por %%mm0, %%mm1		\n\t"
        "por %%mm2, %%mm1		\n\t"
        "movq 14(%1, %%eax), %%mm0	\n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%eax)	\n\t" // B RGB RGB R
        "movq 16(%1, %%eax), %%mm1	\n\t" // GR BGR BGR
        "movq 18(%1, %%eax), %%mm2	\n\t" // BGR BGR BG
        "pand %%mm6, %%mm0		\n\t"
        "pand %%mm7, %%mm1		\n\t"
        "pand %%mm5, %%mm2		\n\t"
        "por %%mm0, %%mm1		\n\t"
        "por %%mm2, %%mm1		\n\t"
        MOVNTQ" %%mm1, 16(%2, %%eax)	\n\t"
        "addl $24, %%eax		\n\t"
        " js 1b				\n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
    if(mmx_size==23) return; //finished, was multiple of 8
    /* rewind to the unprocessed tail and let the scalar loop handle it */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    for(i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
/**
 * Interleave planar Y/U/V into packed YUY2 (Y0 U0 Y1 V0 ...).
 * vertLumPerChroma = number of luma lines per chroma line (2 for YV12,
 * 1 for YUV422P); chroma source pointers advance only every
 * vertLumPerChroma-th output line (must be a power of two for the mask test).
 * NOTE(review): the non-MMX paths assemble 32/64-bit words byte-by-byte and
 * store them natively — this assumes a little-endian target; confirm if this
 * code is ever built for big-endian.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           unsigned int width, unsigned int height,
                                           unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
{
    unsigned y;
    const unsigned chromWidth= width>>1;
    for(y=0; y<height; y++)
    {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
        asm volatile(
            "xorl %%eax, %%eax		\n\t"
            ".balign 16			\n\t"
            "1:				\n\t"
            PREFETCH" 32(%1, %%eax, 2)	\n\t"
            PREFETCH" 32(%2, %%eax)	\n\t"
            PREFETCH" 32(%3, %%eax)	\n\t"
            "movq (%2, %%eax), %%mm0	\n\t" // U(0)
            "movq %%mm0, %%mm2		\n\t" // U(0)
            "movq (%3, %%eax), %%mm1	\n\t" // V(0)
            "punpcklbw %%mm1, %%mm0	\n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2	\n\t" // UVUV UVUV(8)
            "movq (%1, %%eax,2), %%mm3	\n\t" // Y(0)
            "movq 8(%1, %%eax,2), %%mm5	\n\t" // Y(8)
            "movq %%mm3, %%mm4		\n\t" // Y(0)
            "movq %%mm5, %%mm6		\n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3	\n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4	\n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5	\n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6	\n\t" // YUYV YUYV(12)
            MOVNTQ" %%mm3, (%0, %%eax, 4)	\n\t"
            MOVNTQ" %%mm4, 8(%0, %%eax, 4)	\n\t"
            MOVNTQ" %%mm5, 16(%0, %%eax, 4)	\n\t"
            MOVNTQ" %%mm6, 24(%0, %%eax, 4)	\n\t"
            "addl $8, %%eax			\n\t"
            "cmpl %4, %%eax			\n\t"
            " jb 1b				\n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
            : "%eax"
        );
#else
#if __WORDSIZE >= 64
        /* pack two YUYV pairs per 64-bit store */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }
#else
        /* one YUYV pair per 32-bit store */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i++){
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* advance chroma only once per vertLumPerChroma luma lines */
        if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}
/**
 * Convert planar YV12 to packed YUY2.
 *
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      unsigned int width, unsigned int height,
                                      unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
//FIXME interpolate chroma
    /* YV12 carries one chroma line per two luma lines -> vertLumPerChroma = 2 */
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
/**
 * Convert planar YUV422P to packed YUY2.
 *
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         unsigned int width, unsigned int height,
                                         unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
    /* 4:2:2 has one chroma line per luma line -> vertLumPerChroma = 1 */
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
/**
 * Convert packed YUY2 to planar YV12.
 *
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 * Chroma is taken from the even lines only; odd lines contribute luma only.
 * NOTE(review): the second asm statement reuses %%mm7 (the FF,00 mask) set up
 * by the first asm statement — a fragile cross-asm-block register dependency;
 * it works only as long as the compiler schedules nothing MMX-clobbering
 * between them.
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      unsigned int width, unsigned int height,
                                      unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    unsigned y;
    const unsigned chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        /* even line: split Y into ydst, and U/V into udst/vdst */
        asm volatile(
            "xorl %%eax, %%eax		\n\t"
            "pcmpeqw %%mm7, %%mm7	\n\t"
            "psrlw $8, %%mm7		\n\t" // FF,00,FF,00...
            ".balign 16			\n\t"
            "1:				\n\t"
            PREFETCH" 64(%0, %%eax, 4)	\n\t"
            "movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2		\n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3		\n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0		\n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1		\n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2		\n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0	\n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2	\n\t" // YYYY YYYY(0)
            MOVNTQ" %%mm2, (%1, %%eax, 2)	\n\t"
            "movq 16(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3		\n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4		\n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1		\n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2		\n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4		\n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1	\n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3	\n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm3, 8(%1, %%eax, 2)	\n\t"
            /* de-interleave U and V */
            "movq %%mm0, %%mm2		\n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3		\n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0		\n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1		\n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2		\n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3		\n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0	\n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2	\n\t" // UUUU UUUU(0)
            MOVNTQ" %%mm0, (%3, %%eax)	\n\t"
            MOVNTQ" %%mm2, (%2, %%eax)	\n\t"
            "addl $8, %%eax			\n\t"
            "cmpl %4, %%eax			\n\t"
            " jb 1b				\n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
        ydst += lumStride;
        src  += srcStride;
        /* odd line: luma only; relies on %%mm7 still holding the mask (see note above) */
        asm volatile(
            "xorl %%eax, %%eax		\n\t"
            ".balign 16			\n\t"
            "1:				\n\t"
            PREFETCH" 64(%0, %%eax, 4)	\n\t"
            "movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
            "movq 16(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm3	\n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0		\n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1		\n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2		\n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0	\n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2	\n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm0, (%1, %%eax, 2)	\n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2)	\n\t"
            "addl $8, %%eax			\n\t"
            "cmpl %4, %%eax			\n\t"
            " jb 1b				\n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        unsigned i;
        /* even line: Y + chroma */
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] 	= src[4*i+0];
            udst[i] 	= src[4*i+1];
            ydst[2*i+1] 	= src[4*i+2];
            vdst[i] 	= src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;
        /* odd line: Y only */
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] 	= src[4*i+0];
            ydst[2*i+1] 	= src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}
/**
 * Stub conversion from YVU9 to YV12: only the Y plane is copied.
 * NOTE(review): lumStride/chromStride are ignored — the memcpy assumes a
 * packed Y plane (stride == width) for both source and destination; verify
 * callers. The destination U/V planes are left untouched until the XXX below
 * is implemented (YVU9 chroma is 4x4 subsampled and would need 2x upscaling).
 */
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
                                      uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride)
{
    /* Y Plane */
    memcpy(ydst, ysrc, width*height);
    /* XXX: implement upscaling for U,V */
}
/**
 * Upscale one plane by 2x in both directions with simple interpolation.
 * First and last output lines and the left/right edge pixels are plain
 * replication; interior pixels are 3:1 weighted averages of the four
 * surrounding source pixels.
 * NOTE(review): with eax starting at -mmxSize, the first "movq (%0, %%eax)"
 * loads from src + mmxSize - 1 - mmxSize = src - 1, i.e. one byte before the
 * row — confirm this out-of-bounds read is acceptable for all callers
 * (dst[0]/dst[dstStride] are overwritten with src[0] afterwards).
 * NOTE(review): the C fallback's diagonal pairing (src[x] with
 * src[x+srcStride+1]) does not obviously match the MMX path's averaging —
 * verify which one is the intended filter.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    int x,y;
    // first line: vertical replication, horizontal duplication
    for(x=0; x<srcWidth; x++){
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
    dst+= dstStride;
    for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        const int mmxSize= srcWidth;
        asm volatile(
            "movl %4, %%eax			\n\t"
            "1:				\n\t"
            "movq (%0, %%eax), %%mm0	\n\t"
            "movq (%1, %%eax), %%mm1	\n\t"
            "movq 1(%0, %%eax), %%mm2	\n\t"
            "movq 1(%1, %%eax), %%mm3	\n\t"
            "movq %%mm0, %%mm4		\n\t"
            "movq %%mm1, %%mm5		\n\t"
            /* double PAVGB gives the 3:1 weighting */
            PAVGB" %%mm3, %%mm0		\n\t"
            PAVGB" %%mm3, %%mm0		\n\t"
            PAVGB" %%mm4, %%mm3		\n\t"
            PAVGB" %%mm4, %%mm3		\n\t"
            PAVGB" %%mm2, %%mm1		\n\t"
            PAVGB" %%mm2, %%mm1		\n\t"
            PAVGB" %%mm5, %%mm2		\n\t"
            PAVGB" %%mm5, %%mm2		\n\t"
            "movq %%mm3, %%mm4		\n\t"
            "movq %%mm2, %%mm5		\n\t"
            /* interleave to produce the two horizontally doubled output rows */
            "punpcklbw %%mm1, %%mm3	\n\t"
            "punpckhbw %%mm1, %%mm4	\n\t"
            "punpcklbw %%mm0, %%mm2	\n\t"
            "punpckhbw %%mm0, %%mm5	\n\t"
#if 1
            MOVNTQ" %%mm3, (%2, %%eax, 2)	\n\t"
            MOVNTQ" %%mm4, 8(%2, %%eax, 2)	\n\t"
            MOVNTQ" %%mm2, (%3, %%eax, 2)	\n\t"
            MOVNTQ" %%mm5, 8(%3, %%eax, 2)	\n\t"
#else
            "movq %%mm3, (%2, %%eax, 2)	\n\t"
            "movq %%mm4, 8(%2, %%eax, 2)	\n\t"
            "movq %%mm2, (%3, %%eax, 2)	\n\t"
            "movq %%mm5, 8(%3, %%eax, 2)	\n\t"
#endif
            "addl $8, %%eax			\n\t"
            " js 1b				\n\t"
            :: "r" (src + mmxSize-1), "r" (src + srcStride + mmxSize-1),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%eax"
        );
        /* left edge: replicate the first source pixel on both output rows */
        dst[0]=
        dst[dstStride]= src[0];
#else
        dst[0]=
        dst[dstStride]= src[0];
        for(x=0; x<srcWidth-1; x++){
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
#endif
        /* right edge: replicate the last source pixel on both output rows */
        dst[srcWidth*2 -1]=
        dst[srcWidth*2 -1 + dstStride]= src[srcWidth-1];
        dst+=dstStride*2;
        src+=srcStride;
    }
    src-=srcStride;
    // last line: replication
    for(x=0; x<srcWidth; x++){
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}
/**
 * Convert packed UYVY to planar YV12.
 *
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: write HQ version.
 * Same structure as yuy2toyv12, with the Y/UV byte roles swapped (UYVY
 * stores chroma in the even bytes).
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      unsigned int width, unsigned int height,
                                      unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    unsigned y;
    const unsigned chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        /* even line: extract Y, U and V */
        asm volatile(
            "xorl %%eax, %%eax		\n\t"
            "pcmpeqw %%mm7, %%mm7	\n\t"
            "psrlw $8, %%mm7		\n\t" // FF,00,FF,00...
            ".balign 16			\n\t"
            "1:				\n\t"
            PREFETCH" 64(%0, %%eax, 4)	\n\t"
            "movq (%0, %%eax, 4), %%mm0	\n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1	\n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2		\n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3		\n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0		\n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1		\n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2		\n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3		\n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0	\n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2	\n\t" // YYYY YYYY(0)
            MOVNTQ" %%mm2, (%1, %%eax, 2)	\n\t"
            "movq 16(%0, %%eax, 4), %%mm1	\n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm2	\n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3		\n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4		\n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1		\n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2		\n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3		\n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4		\n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1	\n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3	\n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm3, 8(%1, %%eax, 2)	\n\t"
            /* de-interleave U and V */
            "movq %%mm0, %%mm2		\n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3		\n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0		\n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1		\n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2		\n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3		\n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0	\n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2	\n\t" // UUUU UUUU(0)
            MOVNTQ" %%mm0, (%3, %%eax)	\n\t"
            MOVNTQ" %%mm2, (%2, %%eax)	\n\t"
            "addl $8, %%eax			\n\t"
            "cmpl %4, %%eax			\n\t"
            " jb 1b				\n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
        ydst += lumStride;
        src  += srcStride;
        /* odd line: luma only (Y sits in the odd bytes, hence psrlw $8) */
        asm volatile(
            "xorl %%eax, %%eax		\n\t"
            ".balign 16			\n\t"
            "1:				\n\t"
            PREFETCH" 64(%0, %%eax, 4)	\n\t"
            "movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
            "movq 16(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm3	\n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm0		\n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1		\n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2		\n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3		\n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0	\n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2	\n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm0, (%1, %%eax, 2)	\n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2)	\n\t"
            "addl $8, %%eax			\n\t"
            "cmpl %4, %%eax			\n\t"
            " jb 1b				\n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        unsigned i;
        /* even line: Y + chroma */
        for(i=0; i<chromWidth; i++)
        {
            udst[i] 	= src[4*i+0];
            ydst[2*i+0] 	= src[4*i+1];
            vdst[i] 	= src[4*i+2];
            ydst[2*i+1] 	= src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;
        /* odd line: Y only */
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] 	= src[4*i+1];
            ydst[2*i+1] 	= src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}
  1740. /**
  1741. *
  1742. * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
  1743. * problem for anyone then tell me, and ill fix it)
  1744. * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
  1745. */
  1746. static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  1747. unsigned int width, unsigned int height,
  1748. unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  1749. {
  1750. unsigned y;
  1751. const unsigned chromWidth= width>>1;
  1752. #ifdef HAVE_MMX
  1753. for(y=0; y<height-2; y+=2)
  1754. {
  1755. unsigned i;
  1756. for(i=0; i<2; i++)
  1757. {
  1758. asm volatile(
  1759. "movl %2, %%eax \n\t"
  1760. "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
  1761. "movq "MANGLE(w1111)", %%mm5 \n\t"
  1762. "pxor %%mm7, %%mm7 \n\t"
  1763. "leal (%%eax, %%eax, 2), %%ebx \n\t"
  1764. ".balign 16 \n\t"
  1765. "1: \n\t"
  1766. PREFETCH" 64(%0, %%ebx) \n\t"
  1767. "movd (%0, %%ebx), %%mm0 \n\t"
  1768. "movd 3(%0, %%ebx), %%mm1 \n\t"
  1769. "punpcklbw %%mm7, %%mm0 \n\t"
  1770. "punpcklbw %%mm7, %%mm1 \n\t"
  1771. "movd 6(%0, %%ebx), %%mm2 \n\t"
  1772. "movd 9(%0, %%ebx), %%mm3 \n\t"
  1773. "punpcklbw %%mm7, %%mm2 \n\t"
  1774. "punpcklbw %%mm7, %%mm3 \n\t"
  1775. "pmaddwd %%mm6, %%mm0 \n\t"
  1776. "pmaddwd %%mm6, %%mm1 \n\t"
  1777. "pmaddwd %%mm6, %%mm2 \n\t"
  1778. "pmaddwd %%mm6, %%mm3 \n\t"
  1779. #ifndef FAST_BGR2YV12
  1780. "psrad $8, %%mm0 \n\t"
  1781. "psrad $8, %%mm1 \n\t"
  1782. "psrad $8, %%mm2 \n\t"
  1783. "psrad $8, %%mm3 \n\t"
  1784. #endif
  1785. "packssdw %%mm1, %%mm0 \n\t"
  1786. "packssdw %%mm3, %%mm2 \n\t"
  1787. "pmaddwd %%mm5, %%mm0 \n\t"
  1788. "pmaddwd %%mm5, %%mm2 \n\t"
  1789. "packssdw %%mm2, %%mm0 \n\t"
  1790. "psraw $7, %%mm0 \n\t"
  1791. "movd 12(%0, %%ebx), %%mm4 \n\t"
  1792. "movd 15(%0, %%ebx), %%mm1 \n\t"
  1793. "punpcklbw %%mm7, %%mm4 \n\t"
  1794. "punpcklbw %%mm7, %%mm1 \n\t"
  1795. "movd 18(%0, %%ebx), %%mm2 \n\t"
  1796. "movd 21(%0, %%ebx), %%mm3 \n\t"
  1797. "punpcklbw %%mm7, %%mm2 \n\t"
  1798. "punpcklbw %%mm7, %%mm3 \n\t"
  1799. "pmaddwd %%mm6, %%mm4 \n\t"
  1800. "pmaddwd %%mm6, %%mm1 \n\t"
  1801. "pmaddwd %%mm6, %%mm2 \n\t"
  1802. "pmaddwd %%mm6, %%mm3 \n\t"
  1803. #ifndef FAST_BGR2YV12
  1804. "psrad $8, %%mm4 \n\t"
  1805. "psrad $8, %%mm1 \n\t"
  1806. "psrad $8, %%mm2 \n\t"
  1807. "psrad $8, %%mm3 \n\t"
  1808. #endif
  1809. "packssdw %%mm1, %%mm4 \n\t"
  1810. "packssdw %%mm3, %%mm2 \n\t"
  1811. "pmaddwd %%mm5, %%mm4 \n\t"
  1812. "pmaddwd %%mm5, %%mm2 \n\t"
  1813. "addl $24, %%ebx \n\t"
  1814. "packssdw %%mm2, %%mm4 \n\t"
  1815. "psraw $7, %%mm4 \n\t"
  1816. "packuswb %%mm4, %%mm0 \n\t"
  1817. "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
  1818. MOVNTQ" %%mm0, (%1, %%eax) \n\t"
  1819. "addl $8, %%eax \n\t"
  1820. " js 1b \n\t"
  1821. : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
  1822. : "%eax", "%ebx"
  1823. );
  1824. ydst += lumStride;
  1825. src += srcStride;
  1826. }
  1827. src -= srcStride*2;
  1828. asm volatile(
  1829. "movl %4, %%eax \n\t"
  1830. "movq "MANGLE(w1111)", %%mm5 \n\t"
  1831. "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
  1832. "pxor %%mm7, %%mm7 \n\t"
  1833. "leal (%%eax, %%eax, 2), %%ebx \n\t"
  1834. "addl %%ebx, %%ebx \n\t"
  1835. ".balign 16 \n\t"
  1836. "1: \n\t"
  1837. PREFETCH" 64(%0, %%ebx) \n\t"
  1838. PREFETCH" 64(%1, %%ebx) \n\t"
  1839. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  1840. "movq (%0, %%ebx), %%mm0 \n\t"
  1841. "movq (%1, %%ebx), %%mm1 \n\t"
  1842. "movq 6(%0, %%ebx), %%mm2 \n\t"
  1843. "movq 6(%1, %%ebx), %%mm3 \n\t"
  1844. PAVGB" %%mm1, %%mm0 \n\t"
  1845. PAVGB" %%mm3, %%mm2 \n\t"
  1846. "movq %%mm0, %%mm1 \n\t"
  1847. "movq %%mm2, %%mm3 \n\t"
  1848. "psrlq $24, %%mm0 \n\t"
  1849. "psrlq $24, %%mm2 \n\t"
  1850. PAVGB" %%mm1, %%mm0 \n\t"
  1851. PAVGB" %%mm3, %%mm2 \n\t"
  1852. "punpcklbw %%mm7, %%mm0 \n\t"
  1853. "punpcklbw %%mm7, %%mm2 \n\t"
  1854. #else
  1855. "movd (%0, %%ebx), %%mm0 \n\t"
  1856. "movd (%1, %%ebx), %%mm1 \n\t"
  1857. "movd 3(%0, %%ebx), %%mm2 \n\t"
  1858. "movd 3(%1, %%ebx), %%mm3 \n\t"
  1859. "punpcklbw %%mm7, %%mm0 \n\t"
  1860. "punpcklbw %%mm7, %%mm1 \n\t"
  1861. "punpcklbw %%mm7, %%mm2 \n\t"
  1862. "punpcklbw %%mm7, %%mm3 \n\t"
  1863. "paddw %%mm1, %%mm0 \n\t"
  1864. "paddw %%mm3, %%mm2 \n\t"
  1865. "paddw %%mm2, %%mm0 \n\t"
  1866. "movd 6(%0, %%ebx), %%mm4 \n\t"
  1867. "movd 6(%1, %%ebx), %%mm1 \n\t"
  1868. "movd 9(%0, %%ebx), %%mm2 \n\t"
  1869. "movd 9(%1, %%ebx), %%mm3 \n\t"
  1870. "punpcklbw %%mm7, %%mm4 \n\t"
  1871. "punpcklbw %%mm7, %%mm1 \n\t"
  1872. "punpcklbw %%mm7, %%mm2 \n\t"
  1873. "punpcklbw %%mm7, %%mm3 \n\t"
  1874. "paddw %%mm1, %%mm4 \n\t"
  1875. "paddw %%mm3, %%mm2 \n\t"
  1876. "paddw %%mm4, %%mm2 \n\t"
  1877. "psrlw $2, %%mm0 \n\t"
  1878. "psrlw $2, %%mm2 \n\t"
  1879. #endif
  1880. "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
  1881. "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
  1882. "pmaddwd %%mm0, %%mm1 \n\t"
  1883. "pmaddwd %%mm2, %%mm3 \n\t"
  1884. "pmaddwd %%mm6, %%mm0 \n\t"
  1885. "pmaddwd %%mm6, %%mm2 \n\t"
  1886. #ifndef FAST_BGR2YV12
  1887. "psrad $8, %%mm0 \n\t"
  1888. "psrad $8, %%mm1 \n\t"
  1889. "psrad $8, %%mm2 \n\t"
  1890. "psrad $8, %%mm3 \n\t"
  1891. #endif
  1892. "packssdw %%mm2, %%mm0 \n\t"
  1893. "packssdw %%mm3, %%mm1 \n\t"
  1894. "pmaddwd %%mm5, %%mm0 \n\t"
  1895. "pmaddwd %%mm5, %%mm1 \n\t"
  1896. "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
  1897. "psraw $7, %%mm0 \n\t"
  1898. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  1899. "movq 12(%0, %%ebx), %%mm4 \n\t"
  1900. "movq 12(%1, %%ebx), %%mm1 \n\t"
  1901. "movq 18(%0, %%ebx), %%mm2 \n\t"
  1902. "movq 18(%1, %%ebx), %%mm3 \n\t"
  1903. PAVGB" %%mm1, %%mm4 \n\t"
  1904. PAVGB" %%mm3, %%mm2 \n\t"
  1905. "movq %%mm4, %%mm1 \n\t"
  1906. "movq %%mm2, %%mm3 \n\t"
  1907. "psrlq $24, %%mm4 \n\t"
  1908. "psrlq $24, %%mm2 \n\t"
  1909. PAVGB" %%mm1, %%mm4 \n\t"
  1910. PAVGB" %%mm3, %%mm2 \n\t"
  1911. "punpcklbw %%mm7, %%mm4 \n\t"
  1912. "punpcklbw %%mm7, %%mm2 \n\t"
  1913. #else
  1914. "movd 12(%0, %%ebx), %%mm4 \n\t"
  1915. "movd 12(%1, %%ebx), %%mm1 \n\t"
  1916. "movd 15(%0, %%ebx), %%mm2 \n\t"
  1917. "movd 15(%1, %%ebx), %%mm3 \n\t"
  1918. "punpcklbw %%mm7, %%mm4 \n\t"
  1919. "punpcklbw %%mm7, %%mm1 \n\t"
  1920. "punpcklbw %%mm7, %%mm2 \n\t"
  1921. "punpcklbw %%mm7, %%mm3 \n\t"
  1922. "paddw %%mm1, %%mm4 \n\t"
  1923. "paddw %%mm3, %%mm2 \n\t"
  1924. "paddw %%mm2, %%mm4 \n\t"
  1925. "movd 18(%0, %%ebx), %%mm5 \n\t"
  1926. "movd 18(%1, %%ebx), %%mm1 \n\t"
  1927. "movd 21(%0, %%ebx), %%mm2 \n\t"
  1928. "movd 21(%1, %%ebx), %%mm3 \n\t"
  1929. "punpcklbw %%mm7, %%mm5 \n\t"
  1930. "punpcklbw %%mm7, %%mm1 \n\t"
  1931. "punpcklbw %%mm7, %%mm2 \n\t"
  1932. "punpcklbw %%mm7, %%mm3 \n\t"
  1933. "paddw %%mm1, %%mm5 \n\t"
  1934. "paddw %%mm3, %%mm2 \n\t"
  1935. "paddw %%mm5, %%mm2 \n\t"
  1936. "movq "MANGLE(w1111)", %%mm5 \n\t"
  1937. "psrlw $2, %%mm4 \n\t"
  1938. "psrlw $2, %%mm2 \n\t"
  1939. #endif
  1940. "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
  1941. "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
  1942. "pmaddwd %%mm4, %%mm1 \n\t"
  1943. "pmaddwd %%mm2, %%mm3 \n\t"
  1944. "pmaddwd %%mm6, %%mm4 \n\t"
  1945. "pmaddwd %%mm6, %%mm2 \n\t"
  1946. #ifndef FAST_BGR2YV12
  1947. "psrad $8, %%mm4 \n\t"
  1948. "psrad $8, %%mm1 \n\t"
  1949. "psrad $8, %%mm2 \n\t"
  1950. "psrad $8, %%mm3 \n\t"
  1951. #endif
  1952. "packssdw %%mm2, %%mm4 \n\t"
  1953. "packssdw %%mm3, %%mm1 \n\t"
  1954. "pmaddwd %%mm5, %%mm4 \n\t"
  1955. "pmaddwd %%mm5, %%mm1 \n\t"
  1956. "addl $24, %%ebx \n\t"
  1957. "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
  1958. "psraw $7, %%mm4 \n\t"
  1959. "movq %%mm0, %%mm1 \n\t"
  1960. "punpckldq %%mm4, %%mm0 \n\t"
  1961. "punpckhdq %%mm4, %%mm1 \n\t"
  1962. "packsswb %%mm1, %%mm0 \n\t"
  1963. "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
  1964. "movd %%mm0, (%2, %%eax) \n\t"
  1965. "punpckhdq %%mm0, %%mm0 \n\t"
  1966. "movd %%mm0, (%3, %%eax) \n\t"
  1967. "addl $4, %%eax \n\t"
  1968. " js 1b \n\t"
  1969. : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
  1970. : "%eax", "%ebx"
  1971. );
  1972. udst += chromStride;
  1973. vdst += chromStride;
  1974. src += srcStride*2;
  1975. }
  1976. asm volatile( EMMS" \n\t"
  1977. SFENCE" \n\t"
  1978. :::"memory");
  1979. #else
  1980. y=0;
  1981. #endif
  1982. for(; y<height; y+=2)
  1983. {
  1984. unsigned i;
  1985. for(i=0; i<chromWidth; i++)
  1986. {
  1987. unsigned int b= src[6*i+0];
  1988. unsigned int g= src[6*i+1];
  1989. unsigned int r= src[6*i+2];
  1990. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  1991. unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
  1992. unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
  1993. udst[i] = U;
  1994. vdst[i] = V;
  1995. ydst[2*i] = Y;
  1996. b= src[6*i+3];
  1997. g= src[6*i+4];
  1998. r= src[6*i+5];
  1999. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2000. ydst[2*i+1] = Y;
  2001. }
  2002. ydst += lumStride;
  2003. src += srcStride;
  2004. for(i=0; i<chromWidth; i++)
  2005. {
  2006. unsigned int b= src[6*i+0];
  2007. unsigned int g= src[6*i+1];
  2008. unsigned int r= src[6*i+2];
  2009. unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2010. ydst[2*i] = Y;
  2011. b= src[6*i+3];
  2012. g= src[6*i+4];
  2013. r= src[6*i+5];
  2014. Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
  2015. ydst[2*i+1] = Y;
  2016. }
  2017. udst += chromStride;
  2018. vdst += chromStride;
  2019. ydst += lumStride;
  2020. src += srcStride;
  2021. }
  2022. }
  2023. void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
  2024. unsigned width, unsigned height, unsigned src1Stride,
  2025. unsigned src2Stride, unsigned dstStride){
  2026. unsigned h;
  2027. for(h=0; h < height; h++)
  2028. {
  2029. unsigned w;
  2030. #ifdef HAVE_MMX
  2031. #ifdef HAVE_SSE2
  2032. asm(
  2033. "xorl %%eax, %%eax \n\t"
  2034. "1: \n\t"
  2035. PREFETCH" 64(%1, %%eax) \n\t"
  2036. PREFETCH" 64(%2, %%eax) \n\t"
  2037. "movdqa (%1, %%eax), %%xmm0 \n\t"
  2038. "movdqa (%1, %%eax), %%xmm1 \n\t"
  2039. "movdqa (%2, %%eax), %%xmm2 \n\t"
  2040. "punpcklbw %%xmm2, %%xmm0 \n\t"
  2041. "punpckhbw %%xmm2, %%xmm1 \n\t"
  2042. "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
  2043. "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
  2044. "addl $16, %%eax \n\t"
  2045. "cmpl %3, %%eax \n\t"
  2046. " jb 1b \n\t"
  2047. ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
  2048. : "memory", "%eax"
  2049. );
  2050. #else
  2051. asm(
  2052. "xorl %%eax, %%eax \n\t"
  2053. "1: \n\t"
  2054. PREFETCH" 64(%1, %%eax) \n\t"
  2055. PREFETCH" 64(%2, %%eax) \n\t"
  2056. "movq (%1, %%eax), %%mm0 \n\t"
  2057. "movq 8(%1, %%eax), %%mm2 \n\t"
  2058. "movq %%mm0, %%mm1 \n\t"
  2059. "movq %%mm2, %%mm3 \n\t"
  2060. "movq (%2, %%eax), %%mm4 \n\t"
  2061. "movq 8(%2, %%eax), %%mm5 \n\t"
  2062. "punpcklbw %%mm4, %%mm0 \n\t"
  2063. "punpckhbw %%mm4, %%mm1 \n\t"
  2064. "punpcklbw %%mm5, %%mm2 \n\t"
  2065. "punpckhbw %%mm5, %%mm3 \n\t"
  2066. MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
  2067. MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
  2068. MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
  2069. MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
  2070. "addl $16, %%eax \n\t"
  2071. "cmpl %3, %%eax \n\t"
  2072. " jb 1b \n\t"
  2073. ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
  2074. : "memory", "%eax"
  2075. );
  2076. #endif
  2077. for(w= (width&(~15)); w < width; w++)
  2078. {
  2079. dest[2*w+0] = src1[w];
  2080. dest[2*w+1] = src2[w];
  2081. }
  2082. #else
  2083. for(w=0; w < width; w++)
  2084. {
  2085. dest[2*w+0] = src1[w];
  2086. dest[2*w+1] = src2[w];
  2087. }
  2088. #endif
  2089. dest += dstStride;
  2090. src1 += src1Stride;
  2091. src2 += src2Stride;
  2092. }
  2093. #ifdef HAVE_MMX
  2094. asm(
  2095. EMMS" \n\t"
  2096. SFENCE" \n\t"
  2097. ::: "memory"
  2098. );
  2099. #endif
  2100. }
/*
 * Nearest-neighbour 2x upscale of two planes (turns the 4x4-subsampled
 * chroma of YVU9 into the 2x2-subsampled chroma of YV12): every source
 * byte is written twice horizontally and every source row feeds two
 * output rows.  Per plane it consumes w=width/2 source bytes per row
 * and produces h=height/2 output rows of 2*w bytes; strides in bytes.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       unsigned width, unsigned height,
                                       unsigned srcStride1, unsigned srcStride2,
                                       unsigned dstStride1, unsigned dstStride2)
{
    unsigned y,x,w,h;
    w=width/2; h=height/2;
#ifdef HAVE_MMX
    /* Warm the cache with the second row of each source plane. */
    asm volatile(
        PREFETCH" %0\n\t"
        PREFETCH" %1\n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    /* First plane: src1 -> dst1. */
    for(y=0;y<h;y++){
        /* y>>1: each source row is reused for two consecutive output rows. */
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
#ifdef HAVE_MMX
        /* MMX: duplicate 32 source bytes into 64 output bytes per
         * iteration by unpacking each register with itself
         * (punpcklbw/punpckhbw mmN,mmN doubles every byte).
         * NOTE(review): the loop runs while x<w even when w is not a
         * multiple of 32, so it can read up to 31 bytes and write up to
         * 62 bytes past the nominal row end -- presumably the strides
         * are padded to tolerate this; confirm with the callers. */
        if(w > 32)
        for(;x<w;x+=32)
        {
            asm volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm6\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "movq %%mm4, %%mm5\n\t"
            "movq %%mm6, %%mm7\n\t"
            "punpcklbw %%mm0, %%mm0\n\t"
            "punpckhbw %%mm1, %%mm1\n\t"
            "punpcklbw %%mm2, %%mm2\n\t"
            "punpckhbw %%mm3, %%mm3\n\t"
            "punpcklbw %%mm4, %%mm4\n\t"
            "punpckhbw %%mm5, %%mm5\n\t"
            "punpcklbw %%mm6, %%mm6\n\t"
            "punpckhbw %%mm7, %%mm7\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0\n\t"
            MOVNTQ" %%mm4, 32%0\n\t"
            MOVNTQ" %%mm5, 40%0\n\t"
            MOVNTQ" %%mm6, 48%0\n\t"
            MOVNTQ" %%mm7, 56%0"
            :"=m"(d[2*x])
            :"m"(s1[x])
            :"memory");
        }
#endif
        /* Scalar fallback / remainder: duplicate each byte horizontally. */
        for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    /* Second plane: src2 -> dst2, identical logic. */
    for(y=0;y<h;y++){
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#ifdef HAVE_MMX
        if(w > 32)
        for(;x<w;x+=32)
        {
            asm volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm6\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "movq %%mm4, %%mm5\n\t"
            "movq %%mm6, %%mm7\n\t"
            "punpcklbw %%mm0, %%mm0\n\t"
            "punpckhbw %%mm1, %%mm1\n\t"
            "punpcklbw %%mm2, %%mm2\n\t"
            "punpckhbw %%mm3, %%mm3\n\t"
            "punpcklbw %%mm4, %%mm4\n\t"
            "punpckhbw %%mm5, %%mm5\n\t"
            "punpcklbw %%mm6, %%mm6\n\t"
            "punpckhbw %%mm7, %%mm7\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0\n\t"
            MOVNTQ" %%mm4, 32%0\n\t"
            MOVNTQ" %%mm5, 40%0\n\t"
            MOVNTQ" %%mm6, 48%0\n\t"
            MOVNTQ" %%mm7, 56%0"
            :"=m"(d[2*x])
            :"m"(s2[x])
            :"memory");
        }
#endif
        for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#ifdef HAVE_MMX
    /* Leave MMX state clean and order the non-temporal stores. */
    asm(
        EMMS"   \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
#endif
}
/*
 * Pack planar YVU9 into YUY2: src1 = full-resolution Y plane, src2/src3 =
 * chroma planes subsampled 4x both horizontally and vertically.  Output
 * rows are the byte pattern Y U Y V ...; each chroma sample up[x]/vp[x]
 * covers four consecutive luma samples yp[x2..x2+3] (x2 = 4*x), and four
 * consecutive rows share chroma via y>>2.  The scalar loop emits 8 output
 * bytes per x; w=width/2 iterations per row, h=height rows.  Strides in
 * bytes.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        unsigned width, unsigned height,
                                        unsigned srcStride1, unsigned srcStride2,
                                        unsigned srcStride3, unsigned dstStride)
{
    unsigned y,x,x2,w,h;
    w=width/2; h=height;
#ifdef HAVE_MMX
    /* Warm the cache with the second row of each source plane. */
    asm volatile(
        PREFETCH" %0\n\t"
        PREFETCH" %1\n\t"
        PREFETCH" %2\n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)),"m"(*(src3+srcStride3)):"memory");
#endif
    for(y=0;y<h;y++){
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);   /* chroma row shared by 4 luma rows */
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x2=0;       /* luma index; advances 4x faster than the chroma index x */
        x=0;
#ifdef HAVE_MMX
        /* MMX: 8 chroma samples + 32 luma samples -> 64 output bytes per
         * iteration.  U/V are duplicated by unpacking, interleaved into
         * UV pairs, then interleaved with Y.
         * NOTE(review): the loop runs while x<w even when w is not a
         * multiple of 8, so it may read/write past the nominal row end --
         * presumably strides are padded to tolerate this; confirm. */
        for(;x<w;x+=8,x2+=32)
        {
            asm volatile(
            PREFETCH" 32%1\n\t"
            PREFETCH" 32%2\n\t"
            PREFETCH" 32%3\n\t"
            "movq %1, %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
            "movq %2, %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
            "movq %3, %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
            "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
            "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
            "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
            "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
            "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
            "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
            "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
            "movq %%mm1, %%mm6\n\t"
            "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
            "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
            "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
            "movq 8%1, %%mm0\n\t"
            "movq %%mm0, %%mm3\n\t"
            "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
            "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
            MOVNTQ" %%mm0, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq 16%1, %%mm0\n\t"
            "movq %%mm0, %%mm3\n\t"
            "punpcklbw %%mm5, %%mm4\n\t"
            "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
            "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
            MOVNTQ" %%mm0, 32%0\n\t"
            MOVNTQ" %%mm3, 40%0\n\t"
            "punpckhbw %%mm5, %%mm6\n\t"
            "movq 24%1, %%mm0\n\t"
            "movq %%mm0, %%mm3\n\t"
            "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
            "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
            MOVNTQ" %%mm0, 48%0\n\t"
            MOVNTQ" %%mm3, 56%0\n\t"
            :"=m"(d[8*x])
            :"m"(yp[x2]),"m"(up[x]),"m"(vp[x])
            :"memory");
        }
#endif
        /* Scalar fallback / remainder: 4 luma + 1 chroma pair -> 8 bytes. */
        for(;x<w;x++,x2+=4)
        {
            d[8*x+0]=yp[x2];
            d[8*x+1]=up[x];
            d[8*x+2]=yp[x2+1];
            d[8*x+3]=vp[x];
            d[8*x+4]=yp[x2+2];
            d[8*x+5]=up[x];
            d[8*x+6]=yp[x2+3];
            d[8*x+7]=vp[x];
        }
    }
#ifdef HAVE_MMX
    /* Leave MMX state clean and order the non-temporal stores. */
    asm(
        EMMS"   \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
#endif
}