/*
 * rgb2rgb.c, software RGB to RGB converter,
 * extended with a software PAL8 to RGB converter,
 * a software YUV to YUV converter,
 * and a software YUV to RGB converter.
 * Written by Nick Kurshev.
 * Palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) (under GPL)
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined(HAVE_MMX2)
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_3DNOW
/* On K6, femms is faster than emms. On K7, femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
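/*
 * Illustration (an editorial addition, not part of the original file): with the
 * macros above, a store such as
 *
 *     __asm __volatile(MOVNTQ" %%mm0, %0" : "=m"(*dest));
 *
 * becomes a non-temporal "movntq" on MMX2 builds (which is why the MMX paths
 * below end with SFENCE), and degrades to a plain cached "movq" on older CPUs.
 * PREFETCH and SFENCE likewise fall back to "/nop", which this assembler setup
 * treats as a no-op comment line, so the same asm templates build on every
 * supported CPU.
 */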
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4));
    __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
    if(mm_end == end) mm_end -= MMREG_SIZE*4;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "punpckldq 3%1, %%mm0\n\t"
            "movd 6%1, %%mm1\n\t"
            "punpckldq 9%1, %%mm1\n\t"
            "movd 12%1, %%mm2\n\t"
            "punpckldq 15%1, %%mm2\n\t"
            "movd 18%1, %%mm3\n\t"
            "punpckldq 21%1, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm1\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 0;
    }
}
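/*
 * Usage note (an editorial addition): rgb24to32 expands 3 source bytes into 4
 * destination bytes, so dst must hold at least src_size/3*4 bytes. The mm_end
 * back-off above appears to exist because the asm block reads slightly past
 * the 24 bytes it consumes per iteration; the final pixels always go through
 * the safe byte loop.
 */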
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"
            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}
/*
 * Original by Strepto/Astral
 * Ported to gcc & bugfixed: A'rpi
 * MMX2, 3DNOW optimization by Nick Kurshev
 * 32-bit C version, and the and&add trick by Michael Niedermayer
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
#ifdef HAVE_MMX
    register const char* s=src+src_size;
    register char* d=dst+src_size;
    register int offs=-src_size;
    __asm __volatile(PREFETCH" %0"::"m"(*(s+offs)));
    __asm __volatile(
        "movq %0, %%mm4\n\t"
        ::"m"(mask15s));
    while(offs<0)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "pand %%mm4, %%mm0\n\t"
            "pand %%mm4, %%mm2\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*(d+offs))
            :"m"(*(s+offs))
            );
        offs+=16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
#if 0
    const uint16_t *s1=( uint16_t * )src;
    uint16_t *d1=( uint16_t * )dst;
    const uint16_t *e=(const uint16_t *)(((const uint8_t *)s1)+src_size);
    while( s1<e ){
        register int x=*( s1++ );
        /* 565: rrrrrggggggbbbbb
           555: 0rrrrrgggggbbbbb
           0111 1111 1110 0000 = 0x7FE0
           0000 0000 0001 1111 = 0x001F */
        *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
    }
#else
    const unsigned *s1=( unsigned * )src;
    unsigned *d1=( unsigned * )dst;
    int i;
    int size= src_size>>2;
    for(i=0; i<size; i++)
    {
        register int x= s1[i];
        // d1[i] = x + (x&0x7FE07FE0); // faster, but needs the MSB to be 0, which might not always be true
        d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
    }
#endif
#endif
}
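/*
 * Worked example of the and&add trick above (illustrative, not in the
 * original): for a single RGB555 pixel x, (x & 0x7FFF) + (x & 0x7FE0) adds the
 * R/G field to itself, which shifts bits 5..14 left by one and leaves B in
 * place, yielding RGB565 with the new low green bit cleared. E.g. x = 0x7FFF
 * (white, 555) gives 0x7FFF + 0x7FE0 = 0xFFDF (white, 565). The 32-bit variant
 * converts two packed pixels per add.
 */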
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end,*mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        s++; /* skip the unused alpha byte: each source pixel is 4 bytes wide */
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/4;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=4,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
#endif
}
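/*
 * Packing illustration (an editorial addition): (b>>3) | ((g&0xFC)<<3) |
 * ((r&0xF8)<<8) keeps the top 5/6/5 bits of B/G/R and packs them as
 * rrrrrggg gggbbbbb (RGB565). E.g. pure red (r=0xFF, g=b=0) -> 0xF800;
 * white (r=g=b=0xFF) -> 0xFFFF.
 */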
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end,*mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        s++; /* skip the unused alpha byte: each source pixel is 4 bytes wide */
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/4;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=4,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
#endif
}
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end,*mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    if(mm_end == end) mm_end -= MMREG_SIZE*2;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/3;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=3,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
#endif
}
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end,*mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    if(mm_end == end) mm_end -= MMREG_SIZE*2;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/3;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=3,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
#endif
}
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    int num_pixels= src_size >> 2;
#ifdef HAVE_MMX
    asm volatile (
        "xorl %%eax, %%eax \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 32(%0, %%eax, 4) \n\t"
        "movq (%0, %%eax, 4), %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "pslld $16, %%mm0 \n\t"
        "psrld $16, %%mm1 \n\t"
        "pand "MANGLE(mask32r)", %%mm0 \n\t"
        "pand "MANGLE(mask32g)", %%mm2 \n\t"
        "pand "MANGLE(mask32b)", %%mm1 \n\t"
        "por %%mm0, %%mm2 \n\t"
        "por %%mm1, %%mm2 \n\t"
        MOVNTQ" %%mm2, (%1, %%eax, 4) \n\t"
        "addl $2, %%eax \n\t"
        "cmpl %2, %%eax \n\t"
        " jb 1b \n\t"
        :: "r" (src), "r"(dst), "r" (num_pixels)
        : "%eax"
    );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    int i;
    for(i=0; i<num_pixels; i++)
    {
        dst[4*i + 0] = src[4*i + 2];
        dst[4*i + 1] = src[4*i + 1];
        dst[4*i + 2] = src[4*i + 0];
        /* note: the alpha byte dst[4*i + 3] is left untouched here, while the
           MMX path above clears it via the masks */
    }
#endif
}
/**
 * Height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it).
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
    int y;
    const int chromWidth= width>>1;
    for(y=0; y<height; y++)
    {
#ifdef HAVE_MMX
        // FIXME: handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-bound anyway)
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%eax, 2) \n\t"
            PREFETCH" 32(%2, %%eax) \n\t"
            PREFETCH" 32(%3, %%eax) \n\t"
            "movq (%2, %%eax), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%eax), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
            "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
            MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
            : "%eax"
        );
#else
        int i;
        for(i=0; i<chromWidth; i++)
        {
            dst[4*i+0] = ysrc[2*i+0];
            dst[4*i+1] = usrc[i];
            dst[4*i+2] = ysrc[2*i+1];
            dst[4*i+3] = vsrc[i];
        }
#endif
        if(y&1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}
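/*
 * Usage sketch (an editorial addition; it assumes the plain yv12toyuy2() entry
 * point that rgb2rgb.h exposes dispatches to one of these RENAME()d variants
 * at runtime):
 *
 *     // pack a width x height YV12 frame into a YUY2 buffer
 *     yv12toyuy2(y_plane, u_plane, v_plane, yuy2_out,
 *                width, height,
 *                width,      // lumStride: Y plane tightly packed
 *                width / 2,  // chromStride: U/V planes are half width
 *                width * 2); // dstStride: YUY2 stores 2 bytes per pixel
 */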
/**
 * Height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it).
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    int y;
    const int chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
            MOVNTQ" %%mm2, (%2, %%eax) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
        ydst += lumStride;
        src += srcStride;
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0); mm7 still holds the 0x00FF mask from the block above
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        int i;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            udst[i] = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i] = src[4*i+3];
        }
        ydst += lumStride;
        src += srcStride;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
/**
 * Height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it).
 * Chrominance data is taken only from every second line; the other lines are ignored.
 * FIXME: write an HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    int y;
    const int chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
            MOVNTQ" %%mm2, (%2, %%eax) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
        ydst += lumStride;
        src += srcStride;
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
            "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        int i;
        for(i=0; i<chromWidth; i++)
        {
            udst[i] = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i] = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src += srcStride;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
/**
 * Height should be a multiple of 2 and width should be a multiple of 2 (if this is a
 * problem for anyone then tell me, and I'll fix it).
 * Chrominance data is taken only from every second line (others are ignored) in the C version.
 * FIXME: write an HQ version.
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    int y;
    const int chromWidth= width>>1;
#ifdef HAVE_MMX
    for(y=0; y<height-2; y+=2)
    {
        int i;
        for(i=0; i<2; i++)
        {
            asm volatile(
                "movl %2, %%eax \n\t"
                "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
                "movq "MANGLE(w1111)", %%mm5 \n\t"
                "pxor %%mm7, %%mm7 \n\t"
                "leal (%%eax, %%eax, 2), %%ebx \n\t"
                ".balign 16 \n\t"
                "1: \n\t"
                PREFETCH" 64(%0, %%ebx) \n\t"
                "movd (%0, %%ebx), %%mm0 \n\t"
                "movd 3(%0, %%ebx), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 6(%0, %%ebx), %%mm2 \n\t"
                "movd 9(%0, %%ebx), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm0 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm0 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
#endif
                "packssdw %%mm1, %%mm0 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm0 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "packssdw %%mm2, %%mm0 \n\t"
                "psraw $7, %%mm0 \n\t"
                "movd 12(%0, %%ebx), %%mm4 \n\t"
                "movd 15(%0, %%ebx), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 18(%0, %%ebx), %%mm2 \n\t"
                "movd 21(%0, %%ebx), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm4 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
#endif
                "packssdw %%mm1, %%mm4 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm4 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "addl $24, %%ebx \n\t"
                "packssdw %%mm2, %%mm4 \n\t"
                "psraw $7, %%mm4 \n\t"
                "packuswb %%mm4, %%mm0 \n\t"
                "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
                MOVNTQ" %%mm0, (%1, %%eax) \n\t"
                "addl $8, %%eax \n\t"
                " js 1b \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
                : "%eax", "%ebx"
            );
            ydst += lumStride;
            src += srcStride;
        }
        src -= srcStride*2;
        asm volatile(
            "movl %4, %%eax \n\t"
            "movq "MANGLE(w1111)", %%mm5 \n\t"
            "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            "leal (%%eax, %%eax, 2), %%ebx \n\t"
            "addl %%ebx, %%ebx \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%ebx) \n\t"
            PREFETCH" 64(%1, %%ebx) \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            "movq (%0, %%ebx), %%mm0 \n\t"
            "movq (%1, %%ebx), %%mm1 \n\t"
            "movq 6(%0, %%ebx), %%mm2 \n\t"
            "movq 6(%1, %%ebx), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm0 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            "movd (%0, %%ebx), %%mm0 \n\t"
            "movd (%1, %%ebx), %%mm1 \n\t"
            "movd 3(%0, %%ebx), %%mm2 \n\t"
            "movd 3(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            "movd 6(%0, %%ebx), %%mm4 \n\t"
            "movd 6(%1, %%ebx), %%mm1 \n\t"
            "movd 9(%0, %%ebx), %%mm2 \n\t"
            "movd 9(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm4, %%mm2 \n\t"
            "psrlw $2, %%mm0 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
            "pmaddwd %%mm0, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
#endif
            "packssdw %%mm2, %%mm0 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm0 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0 \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            "movq 12(%0, %%ebx), %%mm4 \n\t"
            "movq 12(%1, %%ebx), %%mm1 \n\t"
            "movq 18(%0, %%ebx), %%mm2 \n\t"
            "movq 18(%1, %%ebx), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm4, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm4 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            "movd 12(%0, %%ebx), %%mm4 \n\t"
            "movd 12(%1, %%ebx), %%mm1 \n\t"
            "movd 15(%0, %%ebx), %%mm2 \n\t"
            "movd 15(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm4 \n\t"
            "movd 18(%0, %%ebx), %%mm5 \n\t"
            "movd 18(%1, %%ebx), %%mm1 \n\t"
            "movd 21(%0, %%ebx), %%mm2 \n\t"
            "movd 21(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm5 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm5, %%mm2 \n\t"
            "movq "MANGLE(w1111)", %%mm5 \n\t"
            "psrlw $2, %%mm4 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
            "pmaddwd %%mm4, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm4 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm4 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
#endif
            "packssdw %%mm2, %%mm4 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm4 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "addl $24, %%ebx \n\t"
            "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "punpckldq %%mm4, %%mm0 \n\t"
            "punpckhdq %%mm4, %%mm1 \n\t"
            "packsswb %%mm1, %%mm0 \n\t"
            "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
            "movd %%mm0, (%2, %%eax) \n\t"
            "punpckhdq %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%3, %%eax) \n\t"
            "addl $4, %%eax \n\t"
            " js 1b \n\t"
            : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
            : "%eax", "%ebx"
        );
        udst += chromStride;
        vdst += chromStride;
        src += srcStride*2;
    }
    asm volatile( EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#else
    y=0;
#endif
    for(; y<height; y+=2)
    {
        int i;
        for(i=0; i<chromWidth; i++)
        {
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];
            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
            unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
            udst[i] = U;
            vdst[i] = V;
            ydst[2*i] = Y;
            b= src[6*i+3];
            g= src[6*i+4];
            r= src[6*i+5];
            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        ydst += lumStride;
        src += srcStride;
        for(i=0; i<chromWidth; i++)
        {
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];
            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i] = Y;
            b= src[6*i+3];
            g= src[6*i+4];
            r= src[6*i+5];
            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
}
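/*
 * Illustration of the C fallback's luma formula (an editorial addition; RY/GY/BY
 * and RGB2YUV_SHIFT are defined elsewhere in the codebase). With the common
 * BT.601 integer coefficients RY=66, GY=129, BY=25 and RGB2YUV_SHIFT=8 (assumed
 * here purely for the example), white (r=g=b=255) gives
 * ((66+129+25)*255 >> 8) + 16 = 219 + 16 = 235, the top of the limited range.
 */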