You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

434 lines
14KB

  1. /*
  2. *
  3. * rgb2rgb.c, Software RGB to RGB convertor
  4. * pluralize by Software PAL8 to RGB convertor
  5. * Software YUV to YUV convertor
  6. * Software YUV to RGB convertor
  7. * Written by Nick Kurshev.
  8. * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
  9. */
  10. #include <inttypes.h>
  11. #include "../config.h"
  12. #include "rgb2rgb.h"
  13. #include "../cpudetect.h"
  14. #include "../mangle.h"
  /*
   * Remember whether this is an x86 build in a separate macro: ARCH_X86 itself
   * is #undef'd and re-#define'd further down while the template is included
   * several times.
   */
  #if defined(ARCH_X86)
  #define CAN_COMPILE_X86_ASM
  #endif

  /* Use 7-bit coefficients instead of 15-bit ones. */
  #define FAST_BGR2YV12
  19. #ifdef CAN_COMPILE_X86_ASM
  /* 64-bit masks selecting one byte (or all three low bytes) of each 32-bit half. */
  static const uint64_t mask32b    __attribute__((aligned(8))) = 0x000000FF000000FFULL;
  static const uint64_t mask32g    __attribute__((aligned(8))) = 0x0000FF000000FF00ULL;
  static const uint64_t mask32r    __attribute__((aligned(8))) = 0x00FF000000FF0000ULL;
  static const uint64_t mask32     __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;

  /* 64-bit masks selecting 3-byte groups and the remaining high bytes. */
  static const uint64_t mask24l    __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
  static const uint64_t mask24h    __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
  static const uint64_t mask24hh   __attribute__((aligned(8))) = 0xFFFF000000000000ULL;
  static const uint64_t mask24hhh  __attribute__((aligned(8))) = 0xFFFFFFFF00000000ULL;
  static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xFFFFFFFFFFFF0000ULL;

  /* Masks replicated over four 16-bit words. */
  static const uint64_t mask15b    __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
  static const uint64_t mask15rg   __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
  static const uint64_t mask15s    __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL; /* 11111111 11100000 */
  /*
   * Per-channel masks for two packed pixels, one in the low 16 bits of each
   * 32-bit half.
   *
   * 16-bit layout: 5 bits red (0xf800), 6 bits green (0x07e0), 5 bits blue (0x001f).
   * 15-bit layout: 5 bits red (0x7c00), 5 bits green (0x03e0), 5 bits blue (0x001f).
   *
   * Both halves of every mask must carry the same pattern. red_15mask and
   * green_15mask used to hold the 16-bit patterns in their low half, which
   * selects the wrong bits for the first pixel of each pair.
   */
  static const uint64_t red_16mask   __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
  static const uint64_t green_16mask __attribute__((aligned(8))) = 0x000007e0000007e0ULL;
  static const uint64_t blue_16mask  __attribute__((aligned(8))) = 0x0000001f0000001fULL;
  static const uint64_t red_15mask   __attribute__((aligned(8))) = 0x00007c0000007c00ULL;
  static const uint64_t green_15mask __attribute__((aligned(8))) = 0x000003e0000003e0ULL;
  static const uint64_t blue_15mask  __attribute__((aligned(8))) = 0x0000001f0000001fULL;
  /*
   * BGR -> Y/U/V coefficients, packed as 16-bit words.
   * FAST_BGR2YV12 selects the 7-bit set, otherwise the 15-bit set is used.
   */
  #if defined(FAST_BGR2YV12)
  static const uint64_t bgr2YCoeff   __attribute__((aligned(8))) = 0x000000210041000DULL;
  static const uint64_t bgr2UCoeff   __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
  static const uint64_t bgr2VCoeff   __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
  #else
  static const uint64_t bgr2YCoeff   __attribute__((aligned(8))) = 0x000020E540830C8BULL;
  static const uint64_t bgr2UCoeff   __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
  static const uint64_t bgr2VCoeff   __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
  #endif

  /* Byte-replicated offsets (0x10 for Y, 0x80 for U/V) and a word-replicated 1. */
  static const uint64_t bgr2YOffset  __attribute__((aligned(8))) = 0x1010101010101010ULL;
  static const uint64_t bgr2UVOffset __attribute__((aligned(8))) = 0x8080808080808080ULL;
  static const uint64_t w1111        __attribute__((aligned(8))) = 0x0001000100010001ULL;

  /* Dither state and tables, currently compiled out. */
  #if 0
  static volatile uint64_t __attribute__((aligned(8))) b5Dither;
  static volatile uint64_t __attribute__((aligned(8))) g5Dither;
  static volatile uint64_t __attribute__((aligned(8))) g6Dither;
  static volatile uint64_t __attribute__((aligned(8))) r5Dither;

  static uint64_t __attribute__((aligned(8))) dither4[2] = {
      0x0103010301030103LL,
      0x0200020002000200LL,
  };

  static uint64_t __attribute__((aligned(8))) dither8[2] = {
      0x0602060206020602LL,
      0x0004000400040004LL,
  };
  #endif
  62. #endif
  /*
   * Fixed-point RGB -> YUV coefficients, scaled by 2^RGB2YUV_SHIFT.
   * Each value is the floating-point weight times the scale, plus 0.5, then
   * truncated toward zero by the (int) cast.
   */
  #define RGB2YUV_SHIFT 8

  /* Luma (Y) weights */
  #define RY ((int)( 0.257 * (1 << RGB2YUV_SHIFT) + 0.5))
  #define GY ((int)( 0.504 * (1 << RGB2YUV_SHIFT) + 0.5))
  #define BY ((int)( 0.098 * (1 << RGB2YUV_SHIFT) + 0.5))

  /* Chroma (U) weights */
  #define RU ((int)(-0.148 * (1 << RGB2YUV_SHIFT) + 0.5))
  #define GU ((int)(-0.291 * (1 << RGB2YUV_SHIFT) + 0.5))
  #define BU ((int)( 0.439 * (1 << RGB2YUV_SHIFT) + 0.5))

  /* Chroma (V) weights */
  #define RV ((int)( 0.439 * (1 << RGB2YUV_SHIFT) + 0.5))
  #define GV ((int)(-0.368 * (1 << RGB2YUV_SHIFT) + 0.5))
  #define BV ((int)(-0.071 * (1 << RGB2YUV_SHIFT) + 0.5))
  73. //Note: we have C, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
  74. //Plain C versions
  75. #undef HAVE_MMX
  76. #undef HAVE_MMX2
  77. #undef HAVE_3DNOW
  78. #undef ARCH_X86
  79. #undef HAVE_SSE2
  80. #define RENAME(a) a ## _C
  81. #include "rgb2rgb_template.c"
  82. #ifdef CAN_COMPILE_X86_ASM
  83. //MMX versions
  84. #undef RENAME
  85. #define HAVE_MMX
  86. #undef HAVE_MMX2
  87. #undef HAVE_3DNOW
  88. #undef HAVE_SSE2
  89. #define ARCH_X86
  90. #define RENAME(a) a ## _MMX
  91. #include "rgb2rgb_template.c"
  92. //MMX2 versions
  93. #undef RENAME
  94. #define HAVE_MMX
  95. #define HAVE_MMX2
  96. #undef HAVE_3DNOW
  97. #undef HAVE_SSE2
  98. #define ARCH_X86
  99. #define RENAME(a) a ## _MMX2
  100. #include "rgb2rgb_template.c"
  101. //3DNOW versions
  102. #undef RENAME
  103. #define HAVE_MMX
  104. #undef HAVE_MMX2
  105. #define HAVE_3DNOW
  106. #undef HAVE_SSE2
  107. #define ARCH_X86
  108. #define RENAME(a) a ## _3DNow
  109. #include "rgb2rgb_template.c"
  110. #endif //CAN_COMPILE_X86_ASM
  /**
   * rgb24to32: run-time dispatcher.
   *
   * Forwards to the first available variant, ordered fastest first:
   * MMX2, 3DNow, MMX, and finally the plain C version.
   */
  void rgb24to32(const uint8_t *src, uint8_t *dst, unsigned src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      if (gCpuCaps.hasMMX2) {
          rgb24to32_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb24to32_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb24to32_MMX(src, dst, src_size);
          return;
      }
  #endif
      /* No usable SIMD variant (or not an x86 build). */
      rgb24to32_C(src, dst, src_size);
  }
  /**
   * rgb32to24: run-time dispatcher.
   *
   * Forwards to the first available variant, ordered fastest first:
   * MMX2, 3DNow, MMX, and finally the plain C version.
   */
  void rgb32to24(const uint8_t *src, uint8_t *dst, unsigned src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      if (gCpuCaps.hasMMX2) {
          rgb32to24_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb32to24_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb32to24_MMX(src, dst, src_size);
          return;
      }
  #endif
      /* No usable SIMD variant (or not an x86 build). */
      rgb32to24_C(src, dst, src_size);
  }
  143. /*
  144. Original by Strepto/Astral
  145. ported to gcc & bugfixed : A'rpi
  146. MMX2, 3DNOW optimization by Nick Kurshev
  147. 32bit c version, and and&add trick by Michael Niedermayer
  148. */
  /**
   * rgb15to16: run-time dispatcher.
   *
   * Forwards to the first available variant, ordered fastest first:
   * MMX2, 3DNow, MMX, and finally the plain C version.
   */
  void rgb15to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      if (gCpuCaps.hasMMX2) {
          rgb15to16_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb15to16_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb15to16_MMX(src, dst, src_size);
          return;
      }
  #endif
      /* No usable SIMD variant (or not an x86 build). */
      rgb15to16_C(src, dst, src_size);
  }
  /**
   * Expand 8-bit palette indices into 4-byte pixels.
   *
   * The palette is assumed to contain bgr32 entries (4 bytes each). Every source
   * byte selects one entry, which is copied unchanged to the output.
   *
   * @param src        num_pixels palette indices
   * @param dst        output buffer, receives num_pixels * 4 bytes
   * @param num_pixels number of pixels to convert
   * @param palette    table of 4-byte entries indexed by the source bytes
   */
  void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  {
      unsigned i;

      /*
       * Copy byte by byte instead of going through an `unsigned *` cast: the
       * buffers are plain byte arrays, so a word access would break the aliasing
       * rules, could fault on a misaligned dst/palette, and would depend on
       * sizeof(unsigned) being 4.
       */
      for (i = 0; i < num_pixels; i++) {
          const uint8_t *entry = palette + src[i] * 4;

          dst[0] = entry[0];
          dst[1] = entry[1];
          dst[2] = entry[2];
          dst[3] = entry[3];
          dst += 4;
      }
  }
  /**
   * Expand 8-bit palette indices into 3-byte pixels.
   *
   * The palette is assumed to contain bgr32 entries (4 bytes each). Only the
   * first three bytes of the selected entry are written for each pixel.
   *
   * A single word store per pixel is deliberately not used: it would write one
   * byte too much and might cause alignment issues on some architectures.
   */
  void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  {
      const uint8_t *const src_end = src + num_pixels;

      /* FIXME slow? */
      while (src != src_end) {
          const uint8_t *entry = palette + *src++ * 4;

          *dst++ = entry[0];
          *dst++ = entry[1];
          *dst++ = entry[2];
      }
  }
  /**
   * rgb32to16: run-time dispatcher.
   *
   * Forwards to the first available variant, ordered fastest first:
   * MMX2, 3DNow, MMX, and finally the plain C version.
   */
  void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      if (gCpuCaps.hasMMX2) {
          rgb32to16_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb32to16_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb32to16_MMX(src, dst, src_size);
          return;
      }
  #endif
      /* No usable SIMD variant (or not an x86 build). */
      rgb32to16_C(src, dst, src_size);
  }
  /**
   * rgb32to15: run-time dispatcher.
   *
   * Forwards to the first available variant, ordered fastest first:
   * MMX2, 3DNow, MMX, and finally the plain C version.
   */
  void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      if (gCpuCaps.hasMMX2) {
          rgb32to15_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb32to15_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb32to15_MMX(src, dst, src_size);
          return;
      }
  #endif
      /* No usable SIMD variant (or not an x86 build). */
      rgb32to15_C(src, dst, src_size);
  }
  /**
   * rgb24to16: run-time dispatcher.
   *
   * Forwards to the first available variant, ordered fastest first:
   * MMX2, 3DNow, MMX, and finally the plain C version.
   */
  void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      if (gCpuCaps.hasMMX2) {
          rgb24to16_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb24to16_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb24to16_MMX(src, dst, src_size);
          return;
      }
  #endif
      /* No usable SIMD variant (or not an x86 build). */
      rgb24to16_C(src, dst, src_size);
  }
  /**
   * rgb24to15: run-time dispatcher.
   *
   * Forwards to the first available variant, ordered fastest first:
   * MMX2, 3DNow, MMX, and finally the plain C version.
   */
  void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      if (gCpuCaps.hasMMX2) {
          rgb24to15_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb24to15_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb24to15_MMX(src, dst, src_size);
          return;
      }
  #endif
      /* No usable SIMD variant (or not an x86 build). */
      rgb24to15_C(src, dst, src_size);
  }
  /**
   * Expand 8-bit palette indices into 2-byte pixels.
   *
   * The palette is assumed to contain bgr16 entries (2 bytes each); see
   * rgb32to16 to convert the palette. Every source byte selects one entry,
   * which is copied unchanged to the output.
   *
   * @param src        num_pixels palette indices
   * @param dst        output buffer, receives num_pixels * 2 bytes
   * @param num_pixels number of pixels to convert
   * @param palette    table of 2-byte entries indexed by the source bytes
   */
  void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  {
      unsigned i;

      /*
       * Copy byte by byte instead of going through a `uint16_t *` cast: the
       * buffers are plain byte arrays, so a 16-bit access would break the
       * aliasing rules and could fault on an odd dst/palette address.
       */
      for (i = 0; i < num_pixels; i++) {
          const uint8_t *entry = palette + src[i] * 2;

          dst[0] = entry[0];
          dst[1] = entry[1];
          dst += 2;
      }
  }
  /**
   * Expand 8-bit palette indices into 2-byte pixels.
   *
   * The palette is assumed to contain bgr15 entries (2 bytes each); see
   * rgb32to15 to convert the palette. Every source byte selects one entry,
   * which is copied unchanged to the output.
   *
   * @param src        num_pixels palette indices
   * @param dst        output buffer, receives num_pixels * 2 bytes
   * @param num_pixels number of pixels to convert
   * @param palette    table of 2-byte entries indexed by the source bytes
   */
  void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  {
      unsigned i;

      /*
       * Copy byte by byte instead of going through a `uint16_t *` cast: the
       * buffers are plain byte arrays, so a 16-bit access would break the
       * aliasing rules and could fault on an odd dst/palette address.
       */
      for (i = 0; i < num_pixels; i++) {
          const uint8_t *entry = palette + src[i] * 2;

          dst[0] = entry[0];
          dst[1] = entry[1];
          dst += 2;
      }
  }
  /**
   * rgb32tobgr32: run-time dispatcher.
   *
   * Forwards to the first available variant, ordered fastest first:
   * MMX2, 3DNow, MMX, and finally the plain C version.
   */
  void rgb32tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      if (gCpuCaps.hasMMX2) {
          rgb32tobgr32_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb32tobgr32_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb32tobgr32_MMX(src, dst, src_size);
          return;
      }
  #endif
      /* No usable SIMD variant (or not an x86 build). */
      rgb32tobgr32_C(src, dst, src_size);
  }
  /**
   * yv12toyuy2: run-time dispatcher.
   *
   * Height should be a multiple of 2 and width should be a multiple of 16
   * (if this is a problem for anyone then tell me, and I'll fix it).
   *
   * Forwards to the first available variant, ordered fastest first:
   * MMX2, 3DNow, MMX, and finally the plain C version.
   */
  void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                  unsigned int width, unsigned int height,
                  unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
  {
  #ifdef CAN_COMPILE_X86_ASM
      if (gCpuCaps.hasMMX2) {
          yv12toyuy2_MMX2(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
          return;
      }
      if (gCpuCaps.has3DNow) {
          yv12toyuy2_3DNow(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
          return;
      }
      if (gCpuCaps.hasMMX) {
          yv12toyuy2_MMX(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
          return;
      }
  #endif
      /* No usable SIMD variant (or not an x86 build). */
      yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  }
  /**
   * yuy2toyv12: run-time dispatcher.
   *
   * Height should be a multiple of 2 and width should be a multiple of 16
   * (if this is a problem for anyone then tell me, and I'll fix it).
   *
   * Forwards to the first available variant, ordered fastest first:
   * MMX2, 3DNow, MMX, and finally the plain C version.
   */
  void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                  unsigned int width, unsigned int height,
                  unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  {
  #ifdef CAN_COMPILE_X86_ASM
      if (gCpuCaps.hasMMX2) {
          yuy2toyv12_MMX2(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
      if (gCpuCaps.has3DNow) {
          yuy2toyv12_3DNow(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
      if (gCpuCaps.hasMMX) {
          yuy2toyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
  #endif
      /* No usable SIMD variant (or not an x86 build). */
      yuy2toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  }
  /**
   * uyvytoyv12: run-time dispatcher.
   *
   * Height should be a multiple of 2 and width should be a multiple of 16
   * (if this is a problem for anyone then tell me, and I'll fix it).
   * Chrominance data is only taken from every second line; the others are
   * ignored. FIXME write HQ version
   *
   * Forwards to the first available variant, ordered fastest first:
   * MMX2, 3DNow, MMX, and finally the plain C version.
   */
  void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                  unsigned int width, unsigned int height,
                  unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  {
  #ifdef CAN_COMPILE_X86_ASM
      if (gCpuCaps.hasMMX2) {
          uyvytoyv12_MMX2(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
      if (gCpuCaps.has3DNow) {
          uyvytoyv12_3DNow(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
      if (gCpuCaps.hasMMX) {
          uyvytoyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
  #endif
      /* No usable SIMD variant (or not an x86 build). */
      uyvytoyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  }
  /**
   * rgb24toyv12: run-time dispatcher.
   *
   * Height should be a multiple of 2 and width should be a multiple of 2
   * (if this is a problem for anyone then tell me, and I'll fix it).
   * Chrominance data is only taken from every second line; the others are
   * ignored. FIXME write HQ version
   *
   * Forwards to the first available variant, ordered fastest first:
   * MMX2, 3DNow, MMX, and finally the plain C version.
   */
  void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                   unsigned int width, unsigned int height,
                   unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  {
  #ifdef CAN_COMPILE_X86_ASM
      if (gCpuCaps.hasMMX2) {
          rgb24toyv12_MMX2(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb24toyv12_3DNow(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb24toyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
  #endif
      /* No usable SIMD variant (or not an x86 build). */
      rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  }
  /**
   * interleaveBytes: run-time dispatcher.
   *
   * Forwards to the first available variant, ordered fastest first:
   * MMX2, 3DNow, MMX, and finally the plain C version.
   */
  void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst,
                       int width, int height, int src1Stride, int src2Stride, int dstStride)
  {
  #ifdef CAN_COMPILE_X86_ASM
      if (gCpuCaps.hasMMX2) {
          interleaveBytes_MMX2(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
          return;
      }
      if (gCpuCaps.has3DNow) {
          interleaveBytes_3DNow(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
          return;
      }
      if (gCpuCaps.hasMMX) {
          interleaveBytes_MMX(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
          return;
      }
  #endif
      /* No usable SIMD variant (or not an x86 build). */
      interleaveBytes_C(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
  }