You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

477 lines
16KB

  1. /*
  2. *
  3. * rgb2rgb.c, Software RGB to RGB convertor
  4. * pluralize by Software PAL8 to RGB convertor
  5. * Software YUV to YUV convertor
  6. * Software YUV to RGB convertor
  7. * Written by Nick Kurshev.
  8. * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
  9. */
  10. #include <inttypes.h>
  11. #include "../config.h"
  12. #include "rgb2rgb.h"
  13. #include "../cpudetect.h"
  14. #include "../mangle.h"
  15. #ifdef ARCH_X86
  16. #define CAN_COMPILE_X86_ASM
  17. #endif
  18. #define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
  19. #ifdef CAN_COMPILE_X86_ASM
  20. static const uint64_t mask32b __attribute__((aligned(8))) = 0x000000FF000000FFULL;
  21. static const uint64_t mask32g __attribute__((aligned(8))) = 0x0000FF000000FF00ULL;
  22. static const uint64_t mask32r __attribute__((aligned(8))) = 0x00FF000000FF0000ULL;
  23. static const uint64_t mask32 __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;
  24. static const uint64_t mask24b __attribute__((aligned(8))) = 0x00FF0000FF0000FFULL;
  25. static const uint64_t mask24g __attribute__((aligned(8))) = 0xFF0000FF0000FF00ULL;
  26. static const uint64_t mask24r __attribute__((aligned(8))) = 0x0000FF0000FF0000ULL;
  27. static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
  28. static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
  29. static const uint64_t mask24hh __attribute__((aligned(8))) = 0xffff000000000000ULL;
  30. static const uint64_t mask24hhh __attribute__((aligned(8))) = 0xffffffff00000000ULL;
  31. static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL;
  32. static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
  33. static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
  34. static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;
  35. static const uint64_t red_16mask __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
  36. static const uint64_t green_16mask __attribute__((aligned(8)))= 0x000007e0000007e0ULL;
  37. static const uint64_t blue_16mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
  38. static const uint64_t red_15mask __attribute__((aligned(8))) = 0x00007c000000f800ULL;
  39. static const uint64_t green_15mask __attribute__((aligned(8)))= 0x000003e0000007e0ULL;
  40. static const uint64_t blue_15mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
  41. #ifdef FAST_BGR2YV12
  42. static const uint64_t bgr2YCoeff __attribute__((aligned(8))) = 0x000000210041000DULL;
  43. static const uint64_t bgr2UCoeff __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
  44. static const uint64_t bgr2VCoeff __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
  45. #else
  46. static const uint64_t bgr2YCoeff __attribute__((aligned(8))) = 0x000020E540830C8BULL;
  47. static const uint64_t bgr2UCoeff __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
  48. static const uint64_t bgr2VCoeff __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
  49. #endif
  50. static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL;
  51. static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL;
  52. static const uint64_t w1111 __attribute__((aligned(8))) = 0x0001000100010001ULL;
  53. #if 0
  54. static volatile uint64_t __attribute__((aligned(8))) b5Dither;
  55. static volatile uint64_t __attribute__((aligned(8))) g5Dither;
  56. static volatile uint64_t __attribute__((aligned(8))) g6Dither;
  57. static volatile uint64_t __attribute__((aligned(8))) r5Dither;
  58. static uint64_t __attribute__((aligned(8))) dither4[2]={
  59. 0x0103010301030103LL,
  60. 0x0200020002000200LL,};
  61. static uint64_t __attribute__((aligned(8))) dither8[2]={
  62. 0x0602060206020602LL,
  63. 0x0004000400040004LL,};
  64. #endif
  65. #endif
  66. #define RGB2YUV_SHIFT 8
  67. #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
  68. #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
  69. #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
  70. #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
  71. #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
  72. #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
  73. #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
  74. #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
  75. #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
  76. //Note: we have C, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
  77. //Plain C versions
  78. #undef HAVE_MMX
  79. #undef HAVE_MMX2
  80. #undef HAVE_3DNOW
  81. #undef ARCH_X86
  82. #undef HAVE_SSE2
  83. #define RENAME(a) a ## _C
  84. #include "rgb2rgb_template.c"
  85. #ifdef CAN_COMPILE_X86_ASM
  86. //MMX versions
  87. #undef RENAME
  88. #define HAVE_MMX
  89. #undef HAVE_MMX2
  90. #undef HAVE_3DNOW
  91. #undef HAVE_SSE2
  92. #define ARCH_X86
  93. #define RENAME(a) a ## _MMX
  94. #include "rgb2rgb_template.c"
  95. //MMX2 versions
  96. #undef RENAME
  97. #define HAVE_MMX
  98. #define HAVE_MMX2
  99. #undef HAVE_3DNOW
  100. #undef HAVE_SSE2
  101. #define ARCH_X86
  102. #define RENAME(a) a ## _MMX2
  103. #include "rgb2rgb_template.c"
  104. //3DNOW versions
  105. #undef RENAME
  106. #define HAVE_MMX
  107. #undef HAVE_MMX2
  108. #define HAVE_3DNOW
  109. #undef HAVE_SSE2
  110. #define ARCH_X86
  111. #define RENAME(a) a ## _3DNow
  112. #include "rgb2rgb_template.c"
  113. #endif //CAN_COMPILE_X86_ASM
  114. void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
  115. {
  116. #ifdef CAN_COMPILE_X86_ASM
  117. // ordered per speed fasterst first
  118. if(gCpuCaps.hasMMX2)
  119. rgb24to32_MMX2(src, dst, src_size);
  120. else if(gCpuCaps.has3DNow)
  121. rgb24to32_3DNow(src, dst, src_size);
  122. else if(gCpuCaps.hasMMX)
  123. rgb24to32_MMX(src, dst, src_size);
  124. else
  125. rgb24to32_C(src, dst, src_size);
  126. #else
  127. rgb24to32_C(src, dst, src_size);
  128. #endif
  129. }
  130. void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
  131. {
  132. #ifdef CAN_COMPILE_X86_ASM
  133. // ordered per speed fasterst first
  134. if(gCpuCaps.hasMMX2)
  135. rgb32to24_MMX2(src, dst, src_size);
  136. else if(gCpuCaps.has3DNow)
  137. rgb32to24_3DNow(src, dst, src_size);
  138. else if(gCpuCaps.hasMMX)
  139. rgb32to24_MMX(src, dst, src_size);
  140. else
  141. rgb32to24_C(src, dst, src_size);
  142. #else
  143. rgb32to24_C(src, dst, src_size);
  144. #endif
  145. }
  146. /*
  147. Original by Strepto/Astral
  148. ported to gcc & bugfixed : A'rpi
  149. MMX2, 3DNOW optimization by Nick Kurshev
  150. 32bit c version, and and&add trick by Michael Niedermayer
  151. */
  152. void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size)
  153. {
  154. #ifdef CAN_COMPILE_X86_ASM
  155. // ordered per speed fasterst first
  156. if(gCpuCaps.hasMMX2)
  157. rgb15to16_MMX2(src, dst, src_size);
  158. else if(gCpuCaps.has3DNow)
  159. rgb15to16_3DNow(src, dst, src_size);
  160. else if(gCpuCaps.hasMMX)
  161. rgb15to16_MMX(src, dst, src_size);
  162. else
  163. rgb15to16_C(src, dst, src_size);
  164. #else
  165. rgb15to16_C(src, dst, src_size);
  166. #endif
  167. }
  168. /**
  169. * Pallete is assumed to contain bgr32
  170. */
  171. void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  172. {
  173. unsigned i;
  174. for(i=0; i<num_pixels; i++)
  175. ((unsigned *)dst)[i] = ((unsigned *)palette)[ src[i] ];
  176. }
  177. /**
  178. * Pallete is assumed to contain bgr32
  179. */
  180. void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  181. {
  182. unsigned i;
  183. /*
  184. writes 1 byte o much and might cause alignment issues on some architectures?
  185. for(i=0; i<num_pixels; i++)
  186. ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[ src[i] ];
  187. */
  188. for(i=0; i<num_pixels; i++)
  189. {
  190. //FIXME slow?
  191. dst[0]= palette[ src[i]*4+0 ];
  192. dst[1]= palette[ src[i]*4+1 ];
  193. dst[2]= palette[ src[i]*4+2 ];
  194. dst+= 3;
  195. }
  196. }
  197. void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
  198. {
  199. #ifdef CAN_COMPILE_X86_ASM
  200. // ordered per speed fasterst first
  201. if(gCpuCaps.hasMMX2)
  202. rgb32to16_MMX2(src, dst, src_size);
  203. else if(gCpuCaps.has3DNow)
  204. rgb32to16_3DNow(src, dst, src_size);
  205. else if(gCpuCaps.hasMMX)
  206. rgb32to16_MMX(src, dst, src_size);
  207. else
  208. rgb32to16_C(src, dst, src_size);
  209. #else
  210. rgb32to16_C(src, dst, src_size);
  211. #endif
  212. }
  213. void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
  214. {
  215. #ifdef CAN_COMPILE_X86_ASM
  216. // ordered per speed fasterst first
  217. if(gCpuCaps.hasMMX2)
  218. rgb32to15_MMX2(src, dst, src_size);
  219. else if(gCpuCaps.has3DNow)
  220. rgb32to15_3DNow(src, dst, src_size);
  221. else if(gCpuCaps.hasMMX)
  222. rgb32to15_MMX(src, dst, src_size);
  223. else
  224. rgb32to15_C(src, dst, src_size);
  225. #else
  226. rgb32to15_C(src, dst, src_size);
  227. #endif
  228. }
  229. void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
  230. {
  231. #ifdef CAN_COMPILE_X86_ASM
  232. // ordered per speed fasterst first
  233. if(gCpuCaps.hasMMX2)
  234. rgb24to16_MMX2(src, dst, src_size);
  235. else if(gCpuCaps.has3DNow)
  236. rgb24to16_3DNow(src, dst, src_size);
  237. else if(gCpuCaps.hasMMX)
  238. rgb24to16_MMX(src, dst, src_size);
  239. else
  240. rgb24to16_C(src, dst, src_size);
  241. #else
  242. rgb24to16_C(src, dst, src_size);
  243. #endif
  244. }
  245. void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
  246. {
  247. #ifdef CAN_COMPILE_X86_ASM
  248. // ordered per speed fasterst first
  249. if(gCpuCaps.hasMMX2)
  250. rgb24to15_MMX2(src, dst, src_size);
  251. else if(gCpuCaps.has3DNow)
  252. rgb24to15_3DNow(src, dst, src_size);
  253. else if(gCpuCaps.hasMMX)
  254. rgb24to15_MMX(src, dst, src_size);
  255. else
  256. rgb24to15_C(src, dst, src_size);
  257. #else
  258. rgb24to15_C(src, dst, src_size);
  259. #endif
  260. }
  261. /**
  262. * Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
  263. */
  264. void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  265. {
  266. unsigned i;
  267. for(i=0; i<num_pixels; i++)
  268. ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
  269. }
  270. /**
  271. * Pallete is assumed to contain bgr15, see rgb32to15 to convert the palette
  272. */
  273. void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  274. {
  275. unsigned i;
  276. for(i=0; i<num_pixels; i++)
  277. ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
  278. }
  279. void rgb32tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
  280. {
  281. #ifdef CAN_COMPILE_X86_ASM
  282. // ordered per speed fasterst first
  283. if(gCpuCaps.hasMMX2)
  284. rgb32tobgr32_MMX2(src, dst, src_size);
  285. else if(gCpuCaps.has3DNow)
  286. rgb32tobgr32_3DNow(src, dst, src_size);
  287. else if(gCpuCaps.hasMMX)
  288. rgb32tobgr32_MMX(src, dst, src_size);
  289. else
  290. rgb32tobgr32_C(src, dst, src_size);
  291. #else
  292. rgb32tobgr32_C(src, dst, src_size);
  293. #endif
  294. }
  295. void rgb24tobgr24(const uint8_t *src, uint8_t *dst, unsigned int src_size)
  296. {
  297. #ifdef CAN_COMPILE_X86_ASM
  298. // ordered per speed fasterst first
  299. if(gCpuCaps.hasMMX2)
  300. rgb24tobgr24_MMX2(src, dst, src_size);
  301. else if(gCpuCaps.has3DNow)
  302. rgb24tobgr24_3DNow(src, dst, src_size);
  303. else if(gCpuCaps.hasMMX)
  304. rgb24tobgr24_MMX(src, dst, src_size);
  305. else
  306. rgb24tobgr24_C(src, dst, src_size);
  307. #else
  308. rgb24tobgr24_C(src, dst, src_size);
  309. #endif
  310. }
  311. /**
  312. *
  313. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  314. * problem for anyone then tell me, and ill fix it)
  315. */
  316. void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  317. unsigned int width, unsigned int height,
  318. unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
  319. {
  320. #ifdef CAN_COMPILE_X86_ASM
  321. // ordered per speed fasterst first
  322. if(gCpuCaps.hasMMX2)
  323. yv12toyuy2_MMX2(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  324. else if(gCpuCaps.has3DNow)
  325. yv12toyuy2_3DNow(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  326. else if(gCpuCaps.hasMMX)
  327. yv12toyuy2_MMX(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  328. else
  329. yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  330. #else
  331. yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  332. #endif
  333. }
  334. /**
  335. *
  336. * width should be a multiple of 16
  337. */
  338. void yuv422ptoyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  339. unsigned int width, unsigned int height,
  340. unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
  341. {
  342. #ifdef CAN_COMPILE_X86_ASM
  343. // ordered per speed fasterst first
  344. if(gCpuCaps.hasMMX2)
  345. yuv422ptoyuy2_MMX2(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  346. else if(gCpuCaps.has3DNow)
  347. yuv422ptoyuy2_3DNow(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  348. else if(gCpuCaps.hasMMX)
  349. yuv422ptoyuy2_MMX(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  350. else
  351. yuv422ptoyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  352. #else
  353. yuv422ptoyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  354. #endif
  355. }
  356. /**
  357. *
  358. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  359. * problem for anyone then tell me, and ill fix it)
  360. */
  361. void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  362. unsigned int width, unsigned int height,
  363. unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  364. {
  365. #ifdef CAN_COMPILE_X86_ASM
  366. // ordered per speed fasterst first
  367. if(gCpuCaps.hasMMX2)
  368. yuy2toyv12_MMX2(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  369. else if(gCpuCaps.has3DNow)
  370. yuy2toyv12_3DNow(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  371. else if(gCpuCaps.hasMMX)
  372. yuy2toyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  373. else
  374. yuy2toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  375. #else
  376. yuy2toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  377. #endif
  378. }
  379. /**
  380. *
  381. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  382. * problem for anyone then tell me, and ill fix it)
  383. * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
  384. */
  385. void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  386. unsigned int width, unsigned int height,
  387. unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  388. {
  389. #ifdef CAN_COMPILE_X86_ASM
  390. // ordered per speed fasterst first
  391. if(gCpuCaps.hasMMX2)
  392. uyvytoyv12_MMX2(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  393. else if(gCpuCaps.has3DNow)
  394. uyvytoyv12_3DNow(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  395. else if(gCpuCaps.hasMMX)
  396. uyvytoyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  397. else
  398. uyvytoyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  399. #else
  400. uyvytoyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  401. #endif
  402. }
  403. /**
  404. *
  405. * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
  406. * problem for anyone then tell me, and ill fix it)
  407. * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
  408. */
  409. void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  410. unsigned int width, unsigned int height,
  411. unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  412. {
  413. #ifdef CAN_COMPILE_X86_ASM
  414. // ordered per speed fasterst first
  415. if(gCpuCaps.hasMMX2)
  416. rgb24toyv12_MMX2(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  417. else if(gCpuCaps.has3DNow)
  418. rgb24toyv12_3DNow(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  419. else if(gCpuCaps.hasMMX)
  420. rgb24toyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  421. else
  422. rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  423. #else
  424. rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  425. #endif
  426. }
  427. void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst,
  428. int width, int height, int src1Stride, int src2Stride, int dstStride)
  429. {
  430. #ifdef CAN_COMPILE_X86_ASM
  431. // ordered per speed fasterst first
  432. if(gCpuCaps.hasMMX2)
  433. interleaveBytes_MMX2(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
  434. else if(gCpuCaps.has3DNow)
  435. interleaveBytes_3DNow(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
  436. else if(gCpuCaps.hasMMX)
  437. interleaveBytes_MMX(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
  438. else
  439. interleaveBytes_C(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
  440. #else
  441. interleaveBytes_C(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
  442. #endif
  443. }