/*
 *
 * rgb2rgb.c, software RGB to RGB converter,
 * plus software PAL8 to RGB converter,
 * software YUV to YUV converter,
 * software YUV to RGB converter.
 * Written by Nick Kurshev.
 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB
#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif
#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB "pavgusb"
#elif defined (HAVE_MMX2)
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB "pavgb"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif
#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
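/*
 * This file is a template: rgb2rgb.c #includes it once per CPU variant,
 * with RENAME() presumably mapping each function to a variant-specific
 * name (e.g. an _MMX or _MMX2 suffix; the exact scheme is defined by the
 * including file) so the best version can be selected at runtime.
 */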
static inline void RENAME(rgb24to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4));
    __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
    if(mm_end == end) mm_end -= MMREG_SIZE*4;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "punpckldq 3%1, %%mm0\n\t"
            "movd 6%1, %%mm1\n\t"
            "punpckldq 9%1, %%mm1\n\t"
            "movd 12%1, %%mm2\n\t"
            "punpckldq 15%1, %%mm2\n\t"
            "movd 18%1, %%mm3\n\t"
            "punpckldq 21%1, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm1\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 0;
    }
}
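/*
 * Inverse of rgb24to32: drops the filler byte of every 4-byte input pixel,
 * writing 24 output bytes per 32 input bytes.
 */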
static inline void RENAME(rgb32to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"
            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}
/*
 * Original by Strepto/Astral,
 * ported to gcc & bugfixed: A'rpi.
 * MMX2, 3DNOW optimization by Nick Kurshev.
 * 32 bit C version, and and&add trick by Michael Niedermayer.
 */
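/*
 * 15 bit (0rrrrrgggggbbbbb) -> 16 bit (rrrrrggggggbbbbb): the red+green
 * field is shifted up one bit, leaving the new green LSB at 0. The C
 * version below uses the and&add trick: (x&0x7FFF7FFF) + (x&0x7FE07FE0)
 * doubles (i.e. shifts left by one) the top ten bits of each 16-bit half,
 * converting two pixels per 32-bit word.
 */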
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    register const char* s=src+src_size;
    register char* d=dst+src_size;
    register int offs=-src_size;
    __asm __volatile(PREFETCH" %0"::"m"(*(s+offs)));
    __asm __volatile(
        "movq %0, %%mm4\n\t"
        ::"m"(mask15s));
    while(offs<0)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "pand %%mm4, %%mm0\n\t"
            "pand %%mm4, %%mm2\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*(d+offs))
            :"m"(*(s+offs))
        );
        offs+=16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
#if 0
    const uint16_t *s1=(const uint16_t *)src;
    uint16_t *d1=(uint16_t *)dst;
    const uint16_t *e=(const uint16_t *)((const uint8_t *)s1+src_size);
    while(s1<e){
        register int x=*(s1++);
        /* input:  0rrrrrgggggbbbbb (15 bit)
           output: rrrrrggggggbbbbb (16 bit)
           0111 1111 1110 0000 = 0x7FE0
           0000 0000 0001 1111 = 0x001F */
        *(d1++)=(x&0x001F)|((x&0x7FE0)<<1);
    }
#else
    const unsigned *s1=(unsigned *)src;
    unsigned *d1=(unsigned *)dst;
    int i;
    int size= src_size>>2;
    for(i=0; i<size; i++)
    {
        register int x= s1[i];
        // d1[i] = x + (x&0x7FE07FE0); // faster, but needs the MSB to be 0, which might not always be true
        d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
    }
#endif
#endif
}
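/*
 * 32 bit (B,G,R,X byte order in memory) -> RGB565. Each 16-bit output
 * pixel is (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8): 5 bits blue, 6 bits
 * green, 5 bits red, as in the scalar fallback below.
 */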
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end, *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        s++; // skip the filler byte of the 32 bit pixel
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/4;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=4,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
#endif
}
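/* As above, but RGB555: (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7). */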
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end, *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        s++; // skip the filler byte of the 32 bit pixel
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/4;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=4,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
#endif
}
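/* Same packing as rgb32to16, but reading 3 bytes per input pixel. */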
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end, *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    if(mm_end == end) mm_end -= MMREG_SIZE*2;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/3;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=3,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
#endif
}
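/* Same packing as rgb32to15, but reading 3 bytes per input pixel. */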
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
    const uint8_t *s = src;
    const uint8_t *end, *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    if(mm_end == end) mm_end -= MMREG_SIZE*2;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned j,i,num_pixels=src_size/3;
    uint16_t *d = (uint16_t *)dst;
    for(i=0,j=0; j<num_pixels; i+=3,j++)
    {
        const int b= src[i+0];
        const int g= src[i+1];
        const int r= src[i+2];
        d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
#endif
}
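/*
 * Swaps the red and blue channels of every 4-byte pixel. The MMX path
 * assumes mask32r/mask32g/mask32b each keep a single channel; note that
 * the C fallback leaves the fourth byte of each pixel unwritten.
 */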
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    int num_pixels= src_size >> 2;
#ifdef HAVE_MMX
    asm volatile (
        "xorl %%eax, %%eax \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        // eax counts pixels; addressing is scaled by 4 bytes per pixel
        PREFETCH" 32(%0, %%eax, 4) \n\t"
        "movq (%0, %%eax, 4), %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "pslld $16, %%mm0 \n\t"
        "psrld $16, %%mm1 \n\t"
        "pand "MANGLE(mask32r)", %%mm0 \n\t"
        "pand "MANGLE(mask32g)", %%mm2 \n\t"
        "pand "MANGLE(mask32b)", %%mm1 \n\t"
        "por %%mm0, %%mm2 \n\t"
        "por %%mm1, %%mm2 \n\t"
        MOVNTQ" %%mm2, (%1, %%eax, 4) \n\t"
        "addl $2, %%eax \n\t"
        "cmpl %2, %%eax \n\t"
        " jb 1b \n\t"
        :: "r" (src), "r"(dst), "r" (num_pixels)
        : "%eax"
    );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    int i;
    for(i=0; i<num_pixels; i++)
    {
        dst[4*i + 0] = src[4*i + 2];
        dst[4*i + 1] = src[4*i + 1];
        dst[4*i + 2] = src[4*i + 0];
    }
#endif
}
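/*
 * Swaps R and B in packed 24 bit pixels. The MMX loop converts 8 pixels
 * (24 bytes) per iteration, isolating each channel with mask24r/g/b and
 * recombining; the trailing C loop handles any leftover pixels.
 */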
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    int i;
#ifdef HAVE_MMX
    int mmx_size= 23 - src_size;
    asm volatile (
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%eax) \n\t"
        "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
        "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
        "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
        "addl $24, %%eax \n\t"
        " js 1b \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
    if(!mmx_size) return; // finished, src_size was a multiple of 8
    src+= src_size;
    dst+= src_size;
    src_size= 24-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    for(i=0; i<src_size; i+=3)
    {
        register int x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
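/*
 * Interleaves planar Y, U and V into packed YUYV (YUY2): each 4-byte
 * output group is Y0 U0 Y1 V0. vertLumPerChroma is the number of luma
 * lines per chroma line (2 for YV12 input, 1 for YUV422P).
 */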
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           unsigned int width, unsigned int height,
                                           unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
{
    int y;
    const int chromWidth= width>>1;
    for(y=0; y<height; y++)
    {
#ifdef HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway)
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%eax, 2) \n\t"
            PREFETCH" 32(%2, %%eax) \n\t"
            PREFETCH" 32(%3, %%eax) \n\t"
            "movq (%2, %%eax), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%eax), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
            "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
            MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
            : "%eax"
        );
#else
        int i;
        for(i=0; i<chromWidth; i++)
        {
            dst[4*i+0] = ysrc[2*i+0];
            dst[4*i+1] = usrc[i];
            dst[4*i+2] = ysrc[2*i+1];
            dst[4*i+3] = vsrc[i];
        }
#endif
        if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      unsigned int width, unsigned int height,
                                      unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
/**
 *
 * width should be a multiple of 16
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         unsigned int width, unsigned int height,
                                         unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      unsigned int width, unsigned int height,
                                      unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    int y;
    const int chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
            MOVNTQ" %%mm2, (%2, %%eax) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
        ydst += lumStride;
        src += srcStride;
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        int i;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            udst[i]     = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i]     = src[4*i+3];
        }
        ydst += lumStride;
        src += srcStride;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line, others are ignored FIXME write HQ version
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      unsigned int width, unsigned int height,
                                      unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    int y;
    const int chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
            MOVNTQ" %%mm2, (%2, %%eax) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
        ydst += lumStride;
        src += srcStride;
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
            "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
            : "memory", "%eax"
        );
#else
        int i;
        for(i=0; i<chromWidth; i++)
        {
            udst[i]     = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i]     = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src += srcStride;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line, others are ignored in the C version FIXME write HQ version
 */
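/*
 * The scalar path below computes fixed-point luma/chroma, e.g.
 *   Y = ((RY*r + GY*g + BY*b) >> RGB2YUV_SHIFT) + 16
 * where RY etc. are presumably BT.601 coefficients scaled by
 * 2^RGB2YUV_SHIFT and defined in the including file; the MMX path does
 * the same multiply-accumulate via pmaddwd with the bgr2YCoeff,
 * bgr2UCoeff and bgr2VCoeff constant vectors.
 */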
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                       unsigned int width, unsigned int height,
                                       unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
    int y;
    const int chromWidth= width>>1;
#ifdef HAVE_MMX
    for(y=0; y<height-2; y+=2)
    {
        int i;
        for(i=0; i<2; i++)
        {
            asm volatile(
                "movl %2, %%eax \n\t"
                "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
                "movq "MANGLE(w1111)", %%mm5 \n\t"
                "pxor %%mm7, %%mm7 \n\t"
                "leal (%%eax, %%eax, 2), %%ebx \n\t"
                ".balign 16 \n\t"
                "1: \n\t"
                PREFETCH" 64(%0, %%ebx) \n\t"
                "movd (%0, %%ebx), %%mm0 \n\t"
                "movd 3(%0, %%ebx), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 6(%0, %%ebx), %%mm2 \n\t"
                "movd 9(%0, %%ebx), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm0 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm0 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
#endif
                "packssdw %%mm1, %%mm0 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm0 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "packssdw %%mm2, %%mm0 \n\t"
                "psraw $7, %%mm0 \n\t"
                "movd 12(%0, %%ebx), %%mm4 \n\t"
                "movd 15(%0, %%ebx), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 18(%0, %%ebx), %%mm2 \n\t"
                "movd 21(%0, %%ebx), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm4 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
#endif
                "packssdw %%mm1, %%mm4 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm4 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "addl $24, %%ebx \n\t"
                "packssdw %%mm2, %%mm4 \n\t"
                "psraw $7, %%mm4 \n\t"
                "packuswb %%mm4, %%mm0 \n\t"
                "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
                MOVNTQ" %%mm0, (%1, %%eax) \n\t"
                "addl $8, %%eax \n\t"
                " js 1b \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
                : "%eax", "%ebx"
            );
            ydst += lumStride;
            src += srcStride;
        }
        src -= srcStride*2;
        asm volatile(
            "movl %4, %%eax \n\t"
            "movq "MANGLE(w1111)", %%mm5 \n\t"
            "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            "leal (%%eax, %%eax, 2), %%ebx \n\t"
            "addl %%ebx, %%ebx \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%ebx) \n\t"
            PREFETCH" 64(%1, %%ebx) \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            "movq (%0, %%ebx), %%mm0 \n\t"
            "movq (%1, %%ebx), %%mm1 \n\t"
            "movq 6(%0, %%ebx), %%mm2 \n\t"
            "movq 6(%1, %%ebx), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm0 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            "movd (%0, %%ebx), %%mm0 \n\t"
            "movd (%1, %%ebx), %%mm1 \n\t"
            "movd 3(%0, %%ebx), %%mm2 \n\t"
            "movd 3(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            "movd 6(%0, %%ebx), %%mm4 \n\t"
            "movd 6(%1, %%ebx), %%mm1 \n\t"
            "movd 9(%0, %%ebx), %%mm2 \n\t"
            "movd 9(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm4, %%mm2 \n\t"
            "psrlw $2, %%mm0 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
            "pmaddwd %%mm0, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
#endif
            "packssdw %%mm2, %%mm0 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm0 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0 \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            "movq 12(%0, %%ebx), %%mm4 \n\t"
            "movq 12(%1, %%ebx), %%mm1 \n\t"
            "movq 18(%0, %%ebx), %%mm2 \n\t"
            "movq 18(%1, %%ebx), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm4, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm4 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            "movd 12(%0, %%ebx), %%mm4 \n\t"
            "movd 12(%1, %%ebx), %%mm1 \n\t"
            "movd 15(%0, %%ebx), %%mm2 \n\t"
            "movd 15(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm4 \n\t"
            "movd 18(%0, %%ebx), %%mm5 \n\t"
            "movd 18(%1, %%ebx), %%mm1 \n\t"
            "movd 21(%0, %%ebx), %%mm2 \n\t"
            "movd 21(%1, %%ebx), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm5 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm5, %%mm2 \n\t"
            "movq "MANGLE(w1111)", %%mm5 \n\t"
            "psrlw $2, %%mm4 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
            "pmaddwd %%mm4, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm4 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm4 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
#endif
            "packssdw %%mm2, %%mm4 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm4 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "addl $24, %%ebx \n\t"
            "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "punpckldq %%mm4, %%mm0 \n\t"
            "punpckhdq %%mm4, %%mm1 \n\t"
            "packsswb %%mm1, %%mm0 \n\t"
            "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
            "movd %%mm0, (%2, %%eax) \n\t"
            "punpckhdq %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%3, %%eax) \n\t"
            "addl $4, %%eax \n\t"
            " js 1b \n\t"
            : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
            : "%eax", "%ebx"
        );
        udst += chromStride;
        vdst += chromStride;
        src += srcStride*2;
    }
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#else
    y=0;
#endif
    for(; y<height; y+=2)
    {
        int i;
        for(i=0; i<chromWidth; i++)
        {
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];
            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
            unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
            udst[i] = U;
            vdst[i] = V;
            ydst[2*i] = Y;
            b= src[6*i+3];
            g= src[6*i+4];
            r= src[6*i+5];
            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        ydst += lumStride;
        src += srcStride;
        for(i=0; i<chromWidth; i++)
        {
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];
            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i] = Y;
            b= src[6*i+3];
            g= src[6*i+4];
            r= src[6*i+5];
            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src += srcStride;
    }
}
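/*
 * Interleaves the bytes of src1 and src2 into dest line by line:
 * dest[2*w] = src1[w], dest[2*w+1] = src2[w].
 */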
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                             int width, int height, int src1Stride, int src2Stride, int dstStride)
{
    int h;
    for(h=0; h < height; h++)
    {
        int w;
#ifdef HAVE_MMX
#ifdef HAVE_SSE2
        asm(
            "xorl %%eax, %%eax \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%eax) \n\t"
            PREFETCH" 64(%2, %%eax) \n\t"
            "movdqa (%1, %%eax), %%xmm0 \n\t"
            "movdqa (%1, %%eax), %%xmm1 \n\t"
            "movdqa (%2, %%eax), %%xmm2 \n\t"
            "punpcklbw %%xmm2, %%xmm0 \n\t"
            "punpckhbw %%xmm2, %%xmm1 \n\t"
            "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
            "movntdq %%xmm1, 16(%0, %%eax, 2) \n\t"
            "addl $16, %%eax \n\t"
            "cmpl %3, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
            : "memory", "%eax"
        );
#else
        asm(
            "xorl %%eax, %%eax \n\t"
            "1: \n\t"
            PREFETCH" 64(%1, %%eax) \n\t"
            PREFETCH" 64(%2, %%eax) \n\t"
            "movq (%1, %%eax), %%mm0 \n\t"
            "movq 8(%1, %%eax), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "movq (%2, %%eax), %%mm4 \n\t"
            "movq 8(%2, %%eax), %%mm5 \n\t"
            "punpcklbw %%mm4, %%mm0 \n\t"
            "punpckhbw %%mm4, %%mm1 \n\t"
            "punpcklbw %%mm5, %%mm2 \n\t"
            "punpckhbw %%mm5, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
            MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
            MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
            "addl $16, %%eax \n\t"
            "cmpl %3, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
            : "memory", "%eax"
        );
#endif
        for(w= (width&(~15)); w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        for(w=0; w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
#ifdef HAVE_MMX
    asm(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
#endif
}