/*
 * rgb2rgb.c, Software RGB to RGB convertor
 * pluralize by Software PAL8 to RGB convertor
 * Software YUV to YUV convertor
 * Software YUV to RGB convertor
 * Written by Nick Kurshev.
 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
 */
#include <stddef.h>
#include <inttypes.h> /* for __WORDSIZE */

#ifndef __WORDSIZE
#warning You have a misconfigured system and will probably lose performance!
#endif

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB "pavgb"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
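
/*
 * Note on the macros above: on a plain MMX build the non-temporal MOVNTQ
 * falls back to an ordinary movq and SFENCE/PREFETCH expand to "/nop", so
 * the asm templates below assemble at every supported CPU level; only the
 * instruction selection changes.
 */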

static inline void RENAME(rgb24to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "punpckldq 3%1, %%mm0\n\t"
            "movd 6%1, %%mm1\n\t"
            "punpckldq 9%1, %%mm1\n\t"
            "movd 12%1, %%mm2\n\t"
            "punpckldq 15%1, %%mm2\n\t"
            "movd 18%1, %%mm3\n\t"
            "punpckldq 21%1, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm1\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 0;
    }
}

static inline void RENAME(rgb32to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"
            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}

/*
 * Original by Strepto/Astral
 * ported to gcc & bugfixed: A'rpi
 * MMX2, 3DNOW optimization by Nick Kurshev
 * 32-bit C version, and the and&add trick by Michael Niedermayer
 */
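
/*
 * The and&add trick: for a 1555 pixel 0RRRRRGGGGGBBBBB, adding (x & 0x7FE0)
 * to (x & 0x7FFF) doubles the R and G fields, i.e. shifts bits 5..14 up by
 * one position while the 5 blue bits stay put, yielding RRRRRGGGGG0BBBBB
 * (565 with a zero low green bit) without any shift instruction. A sketch
 * for a single pixel (illustration only, not used below):
 *
 *     static inline uint16_t rgb15to16_one(uint16_t x)
 *     {
 *         return (x & 0x7FFF) + (x & 0x7FE0);
 *     }
 */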
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    register const uint8_t *s = src;
    register uint8_t *d = dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "pand %%mm4, %%mm0\n\t"
            "pand %%mm4, %%mm2\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
        d += 16;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while(s < mm_end)
    {
        register unsigned x = *((uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d += 4;
        s += 4;
    }
    if(s < end)
    {
        register unsigned short x = *((uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}

static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    unsigned i;
    unsigned num_pixels = src_size/3;
    for(i=0; i<num_pixels; i++)
    {
        dst[3*i + 0] = src[3*i + 2];
        dst[3*i + 1] = src[3*i + 1];
        dst[3*i + 2] = src[3*i + 0];
    }
}

static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    register const uint8_t *s = src;
    register uint8_t *d = dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "psrlq $1, %%mm0\n\t"
            "psrlq $1, %%mm2\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm3\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
        d += 16;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while(s < mm_end)
    {
        register uint32_t x = *((uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s += 4;
        d += 4;
    }
    if(s < end)
    {
        register uint16_t x = *((uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s += 2;
        d += 2;
    }
}

static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        s++;
    }
}

static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $8, %%mm0\n\t"
            "psllq $8, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        s++;
    }
}

static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        s++;
    }
}

static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $7, %%mm0\n\t"
            "psllq $7, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        s++;
    }
}

static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $8, %%mm0\n\t"
            "psllq $8, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $7, %%mm0\n\t"
            "psllq $7, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

/*
 * Here I use a less accurate approximation by simply left-shifting the input
 * value and filling the low order bits with zeroes. This method improves PNG
 * compression but cannot reproduce white exactly, since it does not generate
 * an all-ones maximum value; the net effect is to darken the image slightly.
 *
 * The better method would be "left bit replication":
 *
 *     4 3 2 1 0
 *     ---------
 *     1 1 0 1 1
 *
 *     7 6 5 4 3  2 1 0
 *     ----------------
 *     1 1 0 1 1  1 1 0
 *     |=======|  |===|
 *         |      Leftmost Bits Repeated to Fill Open Bits
 *         |
 *     Original Bits
 */
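
/*
 * A sketch of that bit replication for the 5-bit case (illustration only;
 * the code below takes the simple shift-and-zero-fill route): the top bits
 * re-fill the vacated low bits, so 0x1F widens to 0xFF and full white is
 * preserved.
 *
 *     static inline uint8_t replicate_5to8(uint8_t v5)
 *     {
 *         return (v5 << 3) | (v5 >> 2);   // 11011 -> 11011110, as drawn above
 *     }
 */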
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"
            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* Borrowed 32 to 24 */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"
            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}

static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"
            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* Borrowed 32 to 24 */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"
            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}

static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
    mm_end = end - 3;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 0;
    }
}

static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
    mm_end = end - 3;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 0;
    }
}

static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
#ifdef HAVE_MMX
    /* TODO: unroll this loop */
    asm volatile (
        "xorl %%eax, %%eax \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 32(%0, %%eax) \n\t"
        "movq (%0, %%eax), %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "pslld $16, %%mm0 \n\t"
        "psrld $16, %%mm1 \n\t"
        "pand "MANGLE(mask32r)", %%mm0 \n\t"
        "pand "MANGLE(mask32g)", %%mm2 \n\t"
        "pand "MANGLE(mask32b)", %%mm1 \n\t"
        "por %%mm0, %%mm2 \n\t"
        "por %%mm1, %%mm2 \n\t"
        MOVNTQ" %%mm2, (%1, %%eax) \n\t"
        "addl $8, %%eax \n\t"
        "cmpl %2, %%eax \n\t"
        " jb 1b \n\t"
        :: "r" (src), "r"(dst), "r" (src_size-7)
        : "%eax"
    );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned i;
    unsigned num_pixels = src_size >> 2;
    for(i=0; i<num_pixels; i++)
    {
        dst[4*i + 0] = src[4*i + 2];
        dst[4*i + 1] = src[4*i + 1];
        dst[4*i + 2] = src[4*i + 0];
    }
#endif
}
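
/*
 * Both paths above swap R and B inside each 32-bit pixel: the MMX loop
 * shifts one copy 16 bits left and one 16 bits right within each dword and
 * masks R, G and B back together, while the C loop swaps bytes 0 and 2
 * directly. Note the two paths treat the fourth (alpha/filler) byte
 * differently: the masks zero it, whereas the C loop leaves dst's byte 3
 * unwritten.
 */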

static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i;
#ifdef HAVE_MMX
    int mmx_size = 23 - src_size;
    asm volatile (
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%eax) \n\t"
        "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
        "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
        "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
        "addl $24, %%eax \n\t"
        " js 1b \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
    if(mmx_size==23) return; // finished, size was a multiple of 8
    src += src_size;
    dst += src_size;
    src_size = 23 - mmx_size;
    src -= src_size;
    dst -= src_size;
#endif
    for(i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
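
/*
 * The scalar tail loop reads src[i+2] into a temporary before storing any
 * of dst[i+0..2], so the byte swap also works in place (src == dst).
 */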

static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
{
    unsigned y;
    const unsigned chromWidth = width>>1;
    for(y=0; y<height; y++)
    {
#ifdef HAVE_MMX
        //FIXME: handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-bound anyway)
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%eax, 2) \n\t"
            PREFETCH" 32(%2, %%eax) \n\t"
            PREFETCH" 32(%3, %%eax) \n\t"
            "movq (%2, %%eax), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%eax), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
            "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
            MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
            : "%eax"
        );
#else
#if __WORDSIZE >= 64
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }
#else
        int i;
        int32_t *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i++){
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        if((y&(vertLumPerChroma-1)) == (vertLumPerChroma-1))
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}
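
/*
 * For reference: per pair of luma samples the output above is the YUY2 byte
 * order Y0 U Y1 V. The MMX path builds it with punpcklbw/punpckhbw; the C
 * fallback builds the same order on a little-endian machine by packing
 * y | u<<8 | y'<<16 | v<<24 into each 32-bit store.
 */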

/**
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    unsigned y;
    const unsigned chromWidth = width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
            MOVNTQ" %%mm2, (%2, %%eax) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
        ydst += lumStride;
        src  += srcStride;
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        unsigned i;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            udst[i]     = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i]     = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}

static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
    uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride)
{
    /* Y Plane */
    memcpy(ydst, ysrc, width*height);
    /* XXX: implement upscaling for U,V */
}

static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    int x, y;

    // first line
    for(x=0; x<srcWidth; x++){
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
    dst += dstStride;

    for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        const int mmxSize = srcWidth;
        asm volatile(
            "movl %4, %%eax \n\t"
            "1: \n\t"
            "movq (%0, %%eax), %%mm0 \n\t"
            "movq (%1, %%eax), %%mm1 \n\t"
            "movq 1(%0, %%eax), %%mm2 \n\t"
            "movq 1(%1, %%eax), %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm1, %%mm5 \n\t"
            PAVGB" %%mm3, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm0 \n\t"
            PAVGB" %%mm4, %%mm3 \n\t"
            PAVGB" %%mm4, %%mm3 \n\t"
            PAVGB" %%mm2, %%mm1 \n\t"
            PAVGB" %%mm2, %%mm1 \n\t"
            PAVGB" %%mm5, %%mm2 \n\t"
            PAVGB" %%mm5, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklbw %%mm1, %%mm3 \n\t"
            "punpckhbw %%mm1, %%mm4 \n\t"
            "punpcklbw %%mm0, %%mm2 \n\t"
            "punpckhbw %%mm0, %%mm5 \n\t"
#if 1
            MOVNTQ" %%mm3, (%2, %%eax, 2) \n\t"
            MOVNTQ" %%mm4, 8(%2, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, (%3, %%eax, 2) \n\t"
            MOVNTQ" %%mm5, 8(%3, %%eax, 2) \n\t"
#else
            "movq %%mm3, (%2, %%eax, 2) \n\t"
            "movq %%mm4, 8(%2, %%eax, 2) \n\t"
            "movq %%mm2, (%3, %%eax, 2) \n\t"
            "movq %%mm5, 8(%3, %%eax, 2) \n\t"
#endif
            "addl $8, %%eax \n\t"
            " js 1b \n\t"
            :: "r" (src + mmxSize-1), "r" (src + srcStride + mmxSize-1),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%eax"
        );
        dst[0]=
        dst[dstStride]= src[0];
#else
        dst[0]=
        dst[dstStride]= src[0];
        for(x=0; x<srcWidth-1; x++){
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
#endif
        dst[srcWidth*2 -1]=
        dst[srcWidth*2 -1 + dstStride]= src[srcWidth-1];
        dst += dstStride*2;
        src += srcStride;
    }
    src -= srcStride;

    // last line
    for(x=0; x<srcWidth; x++){
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}
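
/*
 * The interpolation above is a 3:1 bilinear blend between the two nearest
 * source samples: the C fallback computes (3*a + b) >> 2 directly, while the
 * MMX2/3DNOW path approximates the same weights with two chained PAVGB
 * operations, avg(avg(a, b), b), which rounds slightly differently.
 */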

/**
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 * Chrominance data is only taken from every second line; others are ignored.
 * FIXME: write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    unsigned y;
    const unsigned chromWidth = width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
            MOVNTQ" %%mm2, (%2, %%eax) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
        ydst += lumStride;
        src  += srcStride;
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
            "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        unsigned i;
        for(i=0; i<chromWidth; i++)
        {
            udst[i]     = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i]     = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}
  1730. /**
  1731. *
  1732. * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
  1733. * problem for anyone then tell me, and ill fix it)
  1734. * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
  1735. */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
unsigned int width, unsigned int height,
unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
unsigned y;
const unsigned chromWidth= width>>1;
#ifdef HAVE_MMX
for(y=0; y<height-2; y+=2)
{
unsigned i;
for(i=0; i<2; i++)
{
asm volatile(
"movl %2, %%eax \n\t"
"movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
"movq "MANGLE(w1111)", %%mm5 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"leal (%%eax, %%eax, 2), %%ebx \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%ebx) \n\t"
"movd (%0, %%ebx), %%mm0 \n\t"
"movd 3(%0, %%ebx), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"movd 6(%0, %%ebx), %%mm2 \n\t"
"movd 9(%0, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm1, %%mm0 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
"pmaddwd %%mm5, %%mm2 \n\t"
"packssdw %%mm2, %%mm0 \n\t"
"psraw $7, %%mm0 \n\t"
"movd 12(%0, %%ebx), %%mm4 \n\t"
"movd 15(%0, %%ebx), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"movd 18(%0, %%ebx), %%mm2 \n\t"
"movd 21(%0, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm1, %%mm4 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
"pmaddwd %%mm5, %%mm2 \n\t"
"addl $24, %%ebx \n\t"
"packssdw %%mm2, %%mm4 \n\t"
"psraw $7, %%mm4 \n\t"
"packuswb %%mm4, %%mm0 \n\t"
"paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
MOVNTQ" %%mm0, (%1, %%eax) \n\t"
"addl $8, %%eax \n\t"
" js 1b \n\t"
: : "r" (src+width*3), "r" (ydst+width), "g" (-width)
: "%eax", "%ebx"
);
ydst += lumStride;
src += srcStride;
}
src -= srcStride*2;
asm volatile(
"movl %4, %%eax \n\t"
"movq "MANGLE(w1111)", %%mm5 \n\t"
"movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"leal (%%eax, %%eax, 2), %%ebx \n\t"
"addl %%ebx, %%ebx \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%ebx) \n\t"
PREFETCH" 64(%1, %%ebx) \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
"movq (%0, %%ebx), %%mm0 \n\t"
"movq (%1, %%ebx), %%mm1 \n\t"
"movq 6(%0, %%ebx), %%mm2 \n\t"
"movq 6(%1, %%ebx), %%mm3 \n\t"
PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" %%mm3, %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlq $24, %%mm0 \n\t"
"psrlq $24, %%mm2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" %%mm3, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
#else
"movd (%0, %%ebx), %%mm0 \n\t"
"movd (%1, %%ebx), %%mm1 \n\t"
"movd 3(%0, %%ebx), %%mm2 \n\t"
"movd 3(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"movd 6(%0, %%ebx), %%mm4 \n\t"
"movd 6(%1, %%ebx), %%mm1 \n\t"
"movd 9(%0, %%ebx), %%mm2 \n\t"
"movd 9(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm4 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm4, %%mm2 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm2 \n\t"
#endif
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm2, %%mm0 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
"pmaddwd %%mm5, %%mm1 \n\t"
"packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
"psraw $7, %%mm0 \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
"movq 12(%0, %%ebx), %%mm4 \n\t"
"movq 12(%1, %%ebx), %%mm1 \n\t"
"movq 18(%0, %%ebx), %%mm2 \n\t"
"movq 18(%1, %%ebx), %%mm3 \n\t"
PAVGB" %%mm1, %%mm4 \n\t"
PAVGB" %%mm3, %%mm2 \n\t"
"movq %%mm4, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlq $24, %%mm4 \n\t"
"psrlq $24, %%mm2 \n\t"
PAVGB" %%mm1, %%mm4 \n\t"
PAVGB" %%mm3, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
#else
"movd 12(%0, %%ebx), %%mm4 \n\t"
"movd 12(%1, %%ebx), %%mm1 \n\t"
"movd 15(%0, %%ebx), %%mm2 \n\t"
"movd 15(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm4 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm4 \n\t"
"movd 18(%0, %%ebx), %%mm5 \n\t"
"movd 18(%1, %%ebx), %%mm1 \n\t"
"movd 21(%0, %%ebx), %%mm2 \n\t"
"movd 21(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm5 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm5, %%mm2 \n\t"
"movq "MANGLE(w1111)", %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm2 \n\t"
#endif
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
"pmaddwd %%mm4, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm2, %%mm4 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
"pmaddwd %%mm5, %%mm1 \n\t"
"addl $24, %%ebx \n\t"
"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
"psraw $7, %%mm4 \n\t"
"movq %%mm0, %%mm1 \n\t"
"punpckldq %%mm4, %%mm0 \n\t"
"punpckhdq %%mm4, %%mm1 \n\t"
"packsswb %%mm1, %%mm0 \n\t"
"paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
"movd %%mm0, (%2, %%eax) \n\t"
"punpckhdq %%mm0, %%mm0 \n\t"
"movd %%mm0, (%3, %%eax) \n\t"
"addl $4, %%eax \n\t"
" js 1b \n\t"
: : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
: "%eax", "%ebx"
);
udst += chromStride;
vdst += chromStride;
src += srcStride*2;
}
asm volatile( EMMS" \n\t"
SFENCE" \n\t"
:::"memory");
#else
y=0;
#endif
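/* C version and tail rows: fixed-point RGB->YUV with RGB2YUV_SHIFT fractional
bits; +16 / +128 are the usual limited-range luma/chroma offsets. Chroma is
taken from the left pixel of each 2x2 block without averaging (see the FIXME
in the comment above). */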
for(; y<height; y+=2)
{
unsigned i;
for(i=0; i<chromWidth; i++)
{
unsigned int b= src[6*i+0];
unsigned int g= src[6*i+1];
unsigned int r= src[6*i+2];
unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
udst[i] = U;
vdst[i] = V;
ydst[2*i] = Y;
b= src[6*i+3];
g= src[6*i+4];
r= src[6*i+5];
Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
ydst[2*i+1] = Y;
}
ydst += lumStride;
src += srcStride;
for(i=0; i<chromWidth; i++)
{
unsigned int b= src[6*i+0];
unsigned int g= src[6*i+1];
unsigned int r= src[6*i+2];
unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
ydst[2*i] = Y;
b= src[6*i+3];
g= src[6*i+4];
r= src[6*i+5];
Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
ydst[2*i+1] = Y;
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
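/*
* A minimal usage sketch (an assumption for illustration, not from the original
* sources): converting one 640x480 frame with b,g,r byte order (as the C
* fallback above reads it) to planar YV12. All buffer names are hypothetical.
*
* unsigned w= 640, h= 480;
* uint8_t *bgr24; // w*3 bytes per line, h lines
* uint8_t *y, *u, *v; // lumStride= w, chromStride= w/2
* RENAME(rgb24toyv12)(bgr24, y, u, v, w, h, w, w/2, w*3);
*/
/**
* Interleaves the bytes of src1 and src2 into dest:
* dest[2*i]= src1[i], dest[2*i+1]= src2[i] on every line.
*/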
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
unsigned width, unsigned height, unsigned src1Stride,
unsigned src2Stride, unsigned dstStride){
unsigned h;
for(h=0; h < height; h++)
{
unsigned w;
#ifdef HAVE_MMX
#ifdef HAVE_SSE2
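/* note: movdqa/movntdq below require src1, src2 and dest to be 16 byte aligned */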
asm(
"xorl %%eax, %%eax \n\t"
"1: \n\t"
PREFETCH" 64(%1, %%eax) \n\t"
PREFETCH" 64(%2, %%eax) \n\t"
"movdqa (%1, %%eax), %%xmm0 \n\t"
"movdqa (%1, %%eax), %%xmm1 \n\t"
"movdqa (%2, %%eax), %%xmm2 \n\t"
"punpcklbw %%xmm2, %%xmm0 \n\t"
"punpckhbw %%xmm2, %%xmm1 \n\t"
"movntdq %%xmm0, (%0, %%eax, 2) \n\t"
"movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
"addl $16, %%eax \n\t"
"cmpl %3, %%eax \n\t"
" jb 1b \n\t"
::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
: "memory", "%eax"
);
#else
asm(
"xorl %%eax, %%eax \n\t"
"1: \n\t"
PREFETCH" 64(%1, %%eax) \n\t"
PREFETCH" 64(%2, %%eax) \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq 8(%1, %%eax), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq (%2, %%eax), %%mm4 \n\t"
"movq 8(%2, %%eax), %%mm5 \n\t"
"punpcklbw %%mm4, %%mm0 \n\t"
"punpckhbw %%mm4, %%mm1 \n\t"
"punpcklbw %%mm5, %%mm2 \n\t"
"punpckhbw %%mm5, %%mm3 \n\t"
MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
"addl $16, %%eax \n\t"
"cmpl %3, %%eax \n\t"
" jb 1b \n\t"
::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
: "memory", "%eax"
);
#endif
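/* scalar tail: the last width%16 bytes the SIMD loop above did not cover */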
for(w= (width&(~15)); w < width; w++)
{
dest[2*w+0] = src1[w];
dest[2*w+1] = src2[w];
}
#else
for(w=0; w < width; w++)
{
dest[2*w+0] = src1[w];
dest[2*w+1] = src2[w];
}
#endif
dest += dstStride;
src1 += src1Stride;
src2 += src2Stride;
}
#ifdef HAVE_MMX
asm(
EMMS" \n\t"
SFENCE" \n\t"
::: "memory"
);
#endif
}
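/**
* Doubles two chroma planes in both directions (judging by the name, YVU9-style
* quarter-resolution chroma up to YV12-style resolution): each source line is
* used for two destination lines and each sample is written twice per line.
*/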
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
unsigned width, unsigned height,
unsigned srcStride1, unsigned srcStride2,
unsigned dstStride1, unsigned dstStride2)
{
unsigned y,x,w,h;
w=width/2; h=height/2;
#ifdef HAVE_MMX
asm volatile(
PREFETCH" %0\n\t"
PREFETCH" %1\n\t"
::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
for(y=0;y<h;y++){
const uint8_t* s1=src1+srcStride1*(y>>1);
uint8_t* d=dst1+dstStride1*y;
x=0;
#ifdef HAVE_MMX
if(w > 32)
for(;x<w-31;x+=32) /* stop 31 short of w so the scalar tail below handles the remainder */
{
asm volatile(
PREFETCH" 32%1\n\t"
"movq %1, %%mm0\n\t"
"movq 8%1, %%mm2\n\t"
"movq 16%1, %%mm4\n\t"
"movq 24%1, %%mm6\n\t"
"movq %%mm0, %%mm1\n\t"
"movq %%mm2, %%mm3\n\t"
"movq %%mm4, %%mm5\n\t"
"movq %%mm6, %%mm7\n\t"
"punpcklbw %%mm0, %%mm0\n\t"
"punpckhbw %%mm1, %%mm1\n\t"
"punpcklbw %%mm2, %%mm2\n\t"
"punpckhbw %%mm3, %%mm3\n\t"
"punpcklbw %%mm4, %%mm4\n\t"
"punpckhbw %%mm5, %%mm5\n\t"
"punpcklbw %%mm6, %%mm6\n\t"
"punpckhbw %%mm7, %%mm7\n\t"
MOVNTQ" %%mm0, %0\n\t"
MOVNTQ" %%mm1, 8%0\n\t"
MOVNTQ" %%mm2, 16%0\n\t"
MOVNTQ" %%mm3, 24%0\n\t"
MOVNTQ" %%mm4, 32%0\n\t"
MOVNTQ" %%mm5, 40%0\n\t"
MOVNTQ" %%mm6, 48%0\n\t"
MOVNTQ" %%mm7, 56%0"
:"=m"(d[2*x])
:"m"(s1[x])
:"memory");
}
#endif
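/* scalar tail (and plain C version): duplicate each sample horizontally */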
for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
}
for(y=0;y<h;y++){
const uint8_t* s2=src2+srcStride2*(y>>1);
uint8_t* d=dst2+dstStride2*y;
x=0;
#ifdef HAVE_MMX
if(w > 32)
for(;x<w-31;x+=32) /* stop 31 short of w so the scalar tail below handles the remainder */
{
asm volatile(
PREFETCH" 32%1\n\t"
"movq %1, %%mm0\n\t"
"movq 8%1, %%mm2\n\t"
"movq 16%1, %%mm4\n\t"
"movq 24%1, %%mm6\n\t"
"movq %%mm0, %%mm1\n\t"
"movq %%mm2, %%mm3\n\t"
"movq %%mm4, %%mm5\n\t"
"movq %%mm6, %%mm7\n\t"
"punpcklbw %%mm0, %%mm0\n\t"
"punpckhbw %%mm1, %%mm1\n\t"
"punpcklbw %%mm2, %%mm2\n\t"
"punpckhbw %%mm3, %%mm3\n\t"
"punpcklbw %%mm4, %%mm4\n\t"
"punpckhbw %%mm5, %%mm5\n\t"
"punpcklbw %%mm6, %%mm6\n\t"
"punpckhbw %%mm7, %%mm7\n\t"
MOVNTQ" %%mm0, %0\n\t"
MOVNTQ" %%mm1, 8%0\n\t"
MOVNTQ" %%mm2, 16%0\n\t"
MOVNTQ" %%mm3, 24%0\n\t"
MOVNTQ" %%mm4, 32%0\n\t"
MOVNTQ" %%mm5, 40%0\n\t"
MOVNTQ" %%mm6, 48%0\n\t"
MOVNTQ" %%mm7, 56%0"
:"=m"(d[2*x])
:"m"(s2[x])
:"memory");
}
#endif
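/* same scalar tail for the second plane */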
for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
}
#ifdef HAVE_MMX
asm(
EMMS" \n\t"
SFENCE" \n\t"
::: "memory"
);
#endif
}
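/**
* Interleaves planar YVU9 (chroma subsampled by 4 in both directions, hence the
* y>>2 line index and each U/V sample being reused for 4 luma samples) into
* packed YUY2 (Y0 U0 Y1 V0 ...).
*/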
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
uint8_t *dst,
unsigned width, unsigned height,
unsigned srcStride1, unsigned srcStride2,
unsigned srcStride3, unsigned dstStride)
{
unsigned y,x,x2,w,h;
w=width/2; h=height;
#ifdef HAVE_MMX
asm volatile(
PREFETCH" %0\n\t"
PREFETCH" %1\n\t"
PREFETCH" %2\n\t"
::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)),"m"(*(src3+srcStride3)):"memory");
#endif
for(y=0;y<h;y++){
const uint8_t* yp=src1+srcStride1*y;
const uint8_t* up=src2+srcStride2*(y>>2);
const uint8_t* vp=src3+srcStride3*(y>>2);
uint8_t* d=dst+dstStride*y;
x2=0;
x=0;
#ifdef HAVE_MMX
for(;x<w;x+=8,x2+=32)
{
asm volatile(
PREFETCH" 32%1\n\t"
PREFETCH" 32%2\n\t"
PREFETCH" 32%3\n\t"
"movq %1, %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
"movq %2, %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
"movq %3, %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
"movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
"movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
"movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
"punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
"punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
"punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
"punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
"movq %%mm1, %%mm6\n\t"
"punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
"punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
"punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
MOVNTQ" %%mm0, %0\n\t"
MOVNTQ" %%mm3, 8%0\n\t"
"punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
"movq 8%1, %%mm0\n\t"
"movq %%mm0, %%mm3\n\t"
"punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
"punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
MOVNTQ" %%mm0, 16%0\n\t"
MOVNTQ" %%mm3, 24%0\n\t"
"movq %%mm4, %%mm6\n\t"
"movq 16%1, %%mm0\n\t"
"movq %%mm0, %%mm3\n\t"
"punpcklbw %%mm5, %%mm4\n\t"
"punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
"punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
MOVNTQ" %%mm0, 32%0\n\t"
MOVNTQ" %%mm3, 40%0\n\t"
"punpckhbw %%mm5, %%mm6\n\t"
"movq 24%1, %%mm0\n\t"
"movq %%mm0, %%mm3\n\t"
"punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
"punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
MOVNTQ" %%mm0, 48%0\n\t"
MOVNTQ" %%mm3, 56%0\n\t"
:"=m"(d[8*x])
:"m"(yp[x2]),"m"(up[x]),"m"(vp[x])
:"memory");
}
#endif
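/* scalar fallback: each U/V sample covers four consecutive luma samples */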
for(;x<w;x++,x2+=4)
{
d[8*x+0]=yp[x2];
d[8*x+1]=up[x];
d[8*x+2]=yp[x2+1];
d[8*x+3]=vp[x];
d[8*x+4]=yp[x2+2];
d[8*x+5]=up[x];
d[8*x+6]=yp[x2+3];
d[8*x+7]=vp[x];
}
}
#ifdef HAVE_MMX
asm(
EMMS" \n\t"
SFENCE" \n\t"
::: "memory"
);
#endif
}