/*
 * rgb2rgb.c, Software RGB to RGB converter
 * also: Software PAL8 to RGB converter
 *       Software YUV to YUV converter
 *       Software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB "pavgb"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif
#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
static inline void RENAME(rgb24to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "punpckldq 3%1, %%mm0\n\t"
            "movd 6%1, %%mm1\n\t"
            "punpckldq 9%1, %%mm1\n\t"
            "movd 12%1, %%mm2\n\t"
            "punpckldq 15%1, %%mm2\n\t"
            "movd 18%1, %%mm3\n\t"
            "punpckldq 21%1, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm1\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 0;
    }
}
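/*
 * Editor's note on rgb24to32: each 3-byte source pixel becomes a 4-byte
 * destination pixel with a zero filler byte, i.e.
 *   src: B0 G0 R0 B1 G1 R1 ...   ->   dst: B0 G0 R0 00 B1 G1 R1 00 ...
 * The MMX path builds eight such output pixels per iteration and clears the
 * filler lanes with mask32; the scalar loop at the end does the same one
 * byte at a time.
 */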
static inline void RENAME(rgb32to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"
            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}
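/*
 * Editor's note: the MMX block above packs eight 4-byte BGRX pixels
 * (32 bytes, four quadwords) into 24 bytes of BGR24 (three quadwords) by
 * masking away the filler byte and shifting the remaining pieces together;
 * mask24l/mask24h and the mask24hh* constants select the byte lanes being
 * merged into each output quadword.
 */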
/*
  Original by Strepto/Astral
  ported to gcc & bugfixed: A'rpi
  MMX2, 3DNOW optimization by Nick Kurshev
  32bit C version, and the and&add trick by Michael Niedermayer
*/
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    register int offs=15-src_size;
    register const char* s=src-offs;
    register char* d=dst-offs;
    __asm __volatile(PREFETCH" %0"::"m"(*(s+offs)));
    __asm __volatile(
        "movq %0, %%mm4\n\t"
        ::"m"(mask15s));
    while(offs<0)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "pand %%mm4, %%mm0\n\t"
            "pand %%mm4, %%mm2\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*(d+offs))
            :"m"(*(s+offs))
            );
        offs+=16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
#if 0
    const uint16_t *s1=( const uint16_t * )src;
    uint16_t *d1=( uint16_t * )dst;
    const uint16_t *e=( const uint16_t * )( ( const uint8_t * )s1 + src_size );
    while( s1<e ){
        register int x=*( s1++ );
        /* rrrrrggggggbbbbb
           0rrrrrgggggbbbbb
           0111 1111 1110 0000 = 0x7FE0
           0000 0000 0001 1111 = 0x001F */
        *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
    }
#else
    const unsigned *s1=( const unsigned * )src;
    unsigned *d1=( unsigned * )dst;
    int i;
    int size= src_size>>2;
    for(i=0; i<size; i++)
    {
        register int x= s1[i];
        // d1[i] = x + (x&0x7FE07FE0); // faster, but needs the msb to be 0, which may not always be true
        d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
    }
#endif
#endif
}
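/*
 * Editor's sketch of the and&add trick above for a single pixel (a
 * hypothetical helper following the file's RENAME convention, not part of
 * the original API): adding the masked R+G fields to themselves shifts
 * them left by one bit, turning 0RRRRRGGGGGBBBBB (1555) into
 * RRRRRGGGGG0BBBBB (565 with the green LSB left at 0).
 */
static inline uint16_t RENAME(rgb15to16_single)(uint16_t x)
{
    return (uint16_t)((x&0x7FFF) + (x&0x7FE0)); // == (x&0x001F) | ((x&0x7FE0)<<1)
}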
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end, *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/4;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=4,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
#endif
}
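/*
 * Editor's note on the 565 packing used above: for 8bit b, g, r the
 * expression
 *   (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8)
 * keeps the top 5/6/5 bits of each component; e.g. white (b=g=r=0xFF)
 * packs to 0x001F | 0x07E0 | 0xF800 = 0xFFFF. The 555 and 24bit variants
 * below follow the same pattern with 5/5/5 masks and shifts.
 */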
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end, *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/4;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=4,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
#endif
}
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end, *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 11;
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/3;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=3,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
#endif
}
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end, *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 11;
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/3;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=3,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
#endif
}
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
#ifdef HAVE_MMX
    asm volatile (
        "xorl %%eax, %%eax \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 32(%0, %%eax) \n\t"
        "movq (%0, %%eax), %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "pslld $16, %%mm0 \n\t"
        "psrld $16, %%mm1 \n\t"
        "pand "MANGLE(mask32r)", %%mm0 \n\t"
        "pand "MANGLE(mask32g)", %%mm2 \n\t"
        "pand "MANGLE(mask32b)", %%mm1 \n\t"
        "por %%mm0, %%mm2 \n\t"
        "por %%mm1, %%mm2 \n\t"
        MOVNTQ" %%mm2, (%1, %%eax) \n\t"
        "addl $8, %%eax \n\t"
        "cmpl %2, %%eax \n\t"
        " jb 1b \n\t"
        :: "r" (src), "r"(dst), "r" (src_size)
        : "%eax"
    );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    int i;
    int num_pixels= src_size >> 2;
    for(i=0; i<num_pixels; i++)
    {
        dst[4*i + 0] = src[4*i + 2];
        dst[4*i + 1] = src[4*i + 1];
        dst[4*i + 2] = src[4*i + 0];
    }
#endif
}
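/*
 * Editor's note: rgb32tobgr32 swaps R and B within each 4-byte pixel. The
 * C fallback never writes dst[4*i + 3], so on the non-MMX path the 4th
 * (alpha/filler) byte of the destination is left untouched and should be
 * pre-initialized if it matters.
 */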
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    int i;
#ifdef HAVE_MMX
    int mmx_size= 23 - src_size;
    asm volatile (
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%eax) \n\t"
        "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
        "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
        "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
        "addl $24, %%eax \n\t"
        " js 1b \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
    if(mmx_size==23) return; // finished, size was a multiple of 8 pixels
    src+= src_size;
    dst+= src_size;
    src_size= 23 - mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    for(i=0; i<src_size; i+=3)
    {
        register int x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
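/*
 * Editor's note on the loop above: %%eax starts at 23 - src_size (negative)
 * and the source/destination pointers are biased the opposite way, so
 * "addl $24 / js 1b" walks the offset up toward zero at 8 pixels per
 * iteration; any remaining 0..23 tail bytes are then redone through the
 * scalar loop at the end of the function.
 */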
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           unsigned int width, unsigned int height,
                                           unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
{
    int y;
    const int chromWidth= width>>1;
    for(y=0; y<height; y++)
    {
#ifdef HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse of some chroma, but very likely memory-bound anyway)
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%eax, 2) \n\t"
            PREFETCH" 32(%2, %%eax) \n\t"
            PREFETCH" 32(%3, %%eax) \n\t"
            "movq (%2, %%eax), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%eax), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
            "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
            MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
            : "%eax"
        );
#else
        int i;
        for(i=0; i<chromWidth; i++)
        {
            dst[4*i+0] = ysrc[2*i+0];
            dst[4*i+1] = usrc[i];
            dst[4*i+2] = ysrc[2*i+1];
            dst[4*i+3] = vsrc[i];
        }
#endif
        if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}
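/*
 * Editor's sketch of the packing done by yuvPlanartoyuy2: every pair of
 * horizontally adjacent pixels shares one U and one V sample,
 *   dst: Y0 U0 Y1 V0  Y2 U1 Y3 V1 ...
 * so one output line consumes width luma samples but only width/2 chroma
 * samples. vertLumPerChroma says how many input luma lines share one
 * chroma line (2 for YV12 input, 1 for YUV422P input).
 */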
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      unsigned int width, unsigned int height,
                                      unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
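/*
 * Example call (editor's illustration, buffer names are hypothetical):
 *   RENAME(yv12toyuy2)(y_plane, u_plane, v_plane, out,
 *                      width, height,
 *                      width,        // lumStride: one byte per luma sample
 *                      width/2,      // chromStride: half-width chroma planes
 *                      width*2);     // dstStride: YUY2 is 2 bytes per pixel
 */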
/**
 *
 * width should be a multiple of 16
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         unsigned int width, unsigned int height,
                                         unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      unsigned int width, unsigned int height,
                                      unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    int y;
    const int chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
            MOVNTQ" %%mm2, (%2, %%eax) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
        ydst += lumStride;
        src += srcStride;
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        int i;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            udst[i] = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i] = src[4*i+3];
        }
        ydst += lumStride;
        src += srcStride;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}
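/*
 * Editor's note: yuy2toyv12 keeps chroma from the even lines only; the
 * second asm/C pass over each odd line extracts just the luma bytes and
 * drops that line's U and V samples.
 */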
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line, others are ignored. FIXME: write HQ version
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      unsigned int width, unsigned int height,
                                      unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    int y;
    const int chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
            MOVNTQ" %%mm2, (%2, %%eax) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
        ydst += lumStride;
        src += srcStride;
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
            "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        int i;
        for(i=0; i<chromWidth; i++)
        {
            udst[i] = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i] = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src += srcStride;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}
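/*
 * Editor's note: UYVY is the byte-swapped sibling of YUY2,
 *   YUY2: Y0 U0 Y1 V0      UYVY: U0 Y0 V0 Y1
 * which is why uyvytoyv12 masks with mm7 (0x00FF words) where yuy2toyv12
 * shifts right by 8, and vice versa.
 */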
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line, others are ignored in the C version. FIXME: write HQ version
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                       unsigned int width, unsigned int height,
                                       unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    int y;
    const int chromWidth= width>>1;
#ifdef HAVE_MMX
    for(y=0; y<height-2; y+=2)
    {
        int i;
        for(i=0; i<2; i++)
        {
            asm volatile(
                "movl %2, %%eax \n\t"
                "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
                "movq "MANGLE(w1111)", %%mm5 \n\t"
                "pxor %%mm7, %%mm7 \n\t"
                "leal (%%eax, %%eax, 2), %%ebx \n\t"
                ".balign 16 \n\t"
                "1: \n\t"
                PREFETCH" 64(%0, %%ebx) \n\t"
                "movd (%0, %%ebx), %%mm0 \n\t"
                "movd 3(%0, %%ebx), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 6(%0, %%ebx), %%mm2 \n\t"
                "movd 9(%0, %%ebx), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm0 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm0 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
#endif
                "packssdw %%mm1, %%mm0 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm0 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "packssdw %%mm2, %%mm0 \n\t"
                "psraw $7, %%mm0 \n\t"
                "movd 12(%0, %%ebx), %%mm4 \n\t"
                "movd 15(%0, %%ebx), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 18(%0, %%ebx), %%mm2 \n\t"
                "movd 21(%0, %%ebx), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm4 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
#endif
                "packssdw %%mm1, %%mm4 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm4 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "addl $24, %%ebx \n\t"
                "packssdw %%mm2, %%mm4 \n\t"
                "psraw $7, %%mm4 \n\t"
                "packuswb %%mm4, %%mm0 \n\t"
                "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
                MOVNTQ" %%mm0, (%1, %%eax) \n\t"
                "addl $8, %%eax \n\t"
                " js 1b \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
                : "%eax", "%ebx"
            );
            ydst += lumStride;
            src += srcStride;
        }
        src -= srcStride*2;
        asm volatile(
            "movl %4, %%eax \n\t"
            "movq "MANGLE(w1111)", %%mm5 \n\t"
            "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            "leal (%%eax, %%eax, 2), %%ebx \n\t"
            "addl %%ebx, %%ebx \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%ebx) \n\t"
            PREFETCH" 64(%1, %%ebx) \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            "movq (%0, %%ebx), %%mm0 \n\t"
            "movq (%1, %%ebx), %%mm1 \n\t"
            "movq 6(%0, %%ebx), %%mm2 \n\t"
            "movq 6(%1, %%ebx), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm0 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            "movd (%0, %%ebx), %%mm0 \n\t"
            "movd (%1, %%ebx), %%mm1 \n\t"
            "movd 3(%0, %%ebx), %%mm2 \n\t"
            "movd 3(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            "movd 6(%0, %%ebx), %%mm4 \n\t"
            "movd 6(%1, %%ebx), %%mm1 \n\t"
            "movd 9(%0, %%ebx), %%mm2 \n\t"
            "movd 9(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm4, %%mm2 \n\t"
            "psrlw $2, %%mm0 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
            "pmaddwd %%mm0, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
#endif
            "packssdw %%mm2, %%mm0 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm0 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0 \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            "movq 12(%0, %%ebx), %%mm4 \n\t"
            "movq 12(%1, %%ebx), %%mm1 \n\t"
            "movq 18(%0, %%ebx), %%mm2 \n\t"
            "movq 18(%1, %%ebx), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm4, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm4 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            "movd 12(%0, %%ebx), %%mm4 \n\t"
            "movd 12(%1, %%ebx), %%mm1 \n\t"
            "movd 15(%0, %%ebx), %%mm2 \n\t"
            "movd 15(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm4 \n\t"
            "movd 18(%0, %%ebx), %%mm5 \n\t"
            "movd 18(%1, %%ebx), %%mm1 \n\t"
            "movd 21(%0, %%ebx), %%mm2 \n\t"
            "movd 21(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm5 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm5, %%mm2 \n\t"
            "movq "MANGLE(w1111)", %%mm5 \n\t"
            "psrlw $2, %%mm4 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
            "pmaddwd %%mm4, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm4 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm4 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
#endif
            "packssdw %%mm2, %%mm4 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm4 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "addl $24, %%ebx \n\t"
            "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "punpckldq %%mm4, %%mm0 \n\t"
            "punpckhdq %%mm4, %%mm1 \n\t"
            "packsswb %%mm1, %%mm0 \n\t"
            "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
            "movd %%mm0, (%2, %%eax) \n\t"
            "punpckhdq %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%3, %%eax) \n\t"
            "addl $4, %%eax \n\t"
            " js 1b \n\t"
            : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
            : "%eax", "%ebx"
        );
        udst += chromStride;
        vdst += chromStride;
        src += srcStride*2;
    }
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#else
    y=0;
#endif
    for(; y<height; y+=2)
    {
        int i;
        for(i=0; i<chromWidth; i++)
        {
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];
            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
            unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
            udst[i] = U;
            vdst[i] = V;
            ydst[2*i] = Y;
            b= src[6*i+3];
            g= src[6*i+4];
            r= src[6*i+5];
            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        ydst += lumStride;
        src += srcStride;
        for(i=0; i<chromWidth; i++)
        {
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];
            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i] = Y;
            b= src[6*i+3];
            g= src[6*i+4];
            r= src[6*i+5];
            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
}
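/*
 * Editor's note on the C fallback above: assuming the usual BT.601
 * video-range coefficients behind RY/GY/BY, white (r=g=b=255) gives
 * Y = ((RY + GY + BY)*255 >> RGB2YUV_SHIFT) + 16 = 235, so luma lands in
 * 16..235 and chroma is centered on 128, matching the +16/+128 offsets.
 */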
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                             int width, int height, int src1Stride, int src2Stride, int dstStride)
{
    int h;
    for(h=0; h < height; h++)
    {
        int w;
#ifdef HAVE_MMX
#ifdef HAVE_SSE2
        asm(
            "xorl %%eax, %%eax \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%eax) \n\t"
            PREFETCH" 64(%2, %%eax) \n\t"
            "movdqa (%1, %%eax), %%xmm0 \n\t"
            "movdqa (%1, %%eax), %%xmm1 \n\t"
            "movdqa (%2, %%eax), %%xmm2 \n\t"
            "punpcklbw %%xmm2, %%xmm0 \n\t"
            "punpckhbw %%xmm2, %%xmm1 \n\t"
            "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
            "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
            "addl $16, %%eax \n\t"
            "cmpl %3, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
            : "memory", "%eax"
        );
#else
        asm(
            "xorl %%eax, %%eax \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%eax) \n\t"
            PREFETCH" 64(%2, %%eax) \n\t"
            "movq (%1, %%eax), %%mm0 \n\t"
            "movq 8(%1, %%eax), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "movq (%2, %%eax), %%mm4 \n\t"
            "movq 8(%2, %%eax), %%mm5 \n\t"
            "punpcklbw %%mm4, %%mm0 \n\t"
            "punpckhbw %%mm4, %%mm1 \n\t"
            "punpcklbw %%mm5, %%mm2 \n\t"
            "punpckhbw %%mm5, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
            MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
            MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
            "addl $16, %%eax \n\t"
            "cmpl %3, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
            : "memory", "%eax"
        );
#endif
        for(w= (width&(~15)); w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        for(w=0; w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
#ifdef HAVE_MMX
    asm(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
#endif
}
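/*
 * Example use of interleaveBytes (editor's illustration, plane names are
 * hypothetical): merging separate quarter-size U and V planes into one
 * interleaved UVUV... plane, as NV12-style layouts store them:
 *   RENAME(interleaveBytes)(u_plane, v_plane, uv_plane,
 *                           width/2, height/2,
 *                           width/2, width/2, width);
 */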