You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

411 lines
13KB

  /*
   *
   * rgb2rgb.c, Software RGB to RGB converter
   * plus a Software PAL8 to RGB converter
   * Software YUV to YUV converter
   * Software YUV to RGB converter
   * Written by Nick Kurshev.
   * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
   */
  10. #include <inttypes.h>
  11. #include "../config.h"
  12. #include "rgb2rgb.h"
  13. #include "../cpudetect.h"
  /*
   * CAN_COMPILE_X86_ASM records that ARCH_X86 was set by the configuration.
   * It is needed because ARCH_X86 itself is undefined and redefined further
   * down while the template is instantiated.
   */
  #if defined(ARCH_X86)
  #define CAN_COMPILE_X86_ASM
  #endif

  /* use 7 bit coeffs instead of 15bit */
  #define FAST_BGR2YV12
  18. #ifdef CAN_COMPILE_X86_ASM
  /*
   * 64-bit lane masks for the assembly variants; all are 8-byte aligned.
   */

  /* Per 32-bit lane: byte 0, byte 1, byte 2, and the low three bytes. */
  static const uint64_t mask32b    __attribute__((aligned(8))) = 0x000000FF000000FFULL;
  static const uint64_t mask32g    __attribute__((aligned(8))) = 0x0000FF000000FF00ULL;
  static const uint64_t mask32r    __attribute__((aligned(8))) = 0x00FF000000FF0000ULL;
  static const uint64_t mask32     __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;

  /* Byte-group selectors within one quadword: bytes 0-2, bytes 3-5,
   * then the top 2, top 4 and top 6 bytes. */
  static const uint64_t mask24l    __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
  static const uint64_t mask24h    __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
  static const uint64_t mask24hh   __attribute__((aligned(8))) = 0xffff000000000000ULL;
  static const uint64_t mask24hhh  __attribute__((aligned(8))) = 0xffffffff00000000ULL;
  static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL;

  /* Per 16-bit word: bits 0-4, bits 5-14, and bits 5-15. */
  static const uint64_t mask15b    __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
  static const uint64_t mask15rg   __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
  static const uint64_t mask15s    __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;
  /*
   * Channel masks for packed 16-bit (5-6-5) and 15-bit (5-5-5) pixels.
   * Each constant carries the same mask in both 32-bit halves.
   */
  static const uint64_t red_16mask   __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
  static const uint64_t green_16mask __attribute__((aligned(8))) = 0x000007e0000007e0ULL;
  static const uint64_t blue_16mask  __attribute__((aligned(8))) = 0x0000001f0000001fULL;

  /*
   * The low halves of the red and green 15-bit masks used to hold the
   * 16-bit values (0xf800 / 0x07e0), so the two halves disagreed.
   * Both halves now carry the 5-5-5 masks 0x7c00 / 0x03e0.
   * NOTE(review): the consumers are in rgb2rgb_template.c - confirm there
   * that each 32-bit half is expected to hold one pixel.
   */
  static const uint64_t red_15mask   __attribute__((aligned(8))) = 0x00007c0000007c00ULL;
  static const uint64_t green_15mask __attribute__((aligned(8))) = 0x000003e0000003e0ULL;
  static const uint64_t blue_15mask  __attribute__((aligned(8))) = 0x0000001f0000001fULL;
  /*
   * BGR -> YUV coefficients for the assembly variants, packed as 16-bit
   * words. FAST_BGR2YV12 selects the 7 bit set, otherwise the 15 bit set.
   * NOTE(review): the word order looks like B, G, R from low to high with
   * the top word zero - confirm against rgb2rgb_template.c.
   */
  #ifdef FAST_BGR2YV12
  static const uint64_t bgr2YCoeff   __attribute__((aligned(8))) = 0x000000210041000DULL;
  static const uint64_t bgr2UCoeff   __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
  static const uint64_t bgr2VCoeff   __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
  #else
  static const uint64_t bgr2YCoeff   __attribute__((aligned(8))) = 0x000020E540830C8BULL;
  static const uint64_t bgr2UCoeff   __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
  static const uint64_t bgr2VCoeff   __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
  #endif

  /* 16 and 128 replicated into every byte, and 1 in every 16-bit word. */
  static const uint64_t bgr2YOffset  __attribute__((aligned(8))) = 0x1010101010101010ULL;
  static const uint64_t bgr2UVOffset __attribute__((aligned(8))) = 0x8080808080808080ULL;
  static const uint64_t w1111        __attribute__((aligned(8))) = 0x0001000100010001ULL;

  /* Dither state and tables; compiled out. */
  #if 0
  static volatile uint64_t __attribute__((aligned(8))) b5Dither;
  static volatile uint64_t __attribute__((aligned(8))) g5Dither;
  static volatile uint64_t __attribute__((aligned(8))) g6Dither;
  static volatile uint64_t __attribute__((aligned(8))) r5Dither;
  static uint64_t __attribute__((aligned(8))) dither4[2]={
  0x0103010301030103LL,
  0x0200020002000200LL,};
  static uint64_t __attribute__((aligned(8))) dither8[2]={
  0x0602060206020602LL,
  0x0004000400040004LL,};
  #endif
  61. #endif
  /*
   * Fixed-point RGB -> YUV coefficients, scaled by 2^RGB2YUV_SHIFT.
   * Naming: first letter is the input channel (R/G/B), second the output
   * component (Y/U/V).
   * The "+0.5" is applied before the truncating int cast, so negative
   * coefficients end up rounded towards zero (e.g. -18.176 becomes -17).
   */
  #define RGB2YUV_SHIFT 8

  /* luma */
  #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
  #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
  #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))

  /* U */
  #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
  #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
  #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))

  /* V */
  #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
  #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
  #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
  72. //Note: we have C, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
  73. //Plain C versions
  74. #undef HAVE_MMX
  75. #undef HAVE_MMX2
  76. #undef HAVE_3DNOW
  77. #undef ARCH_X86
  78. #define RENAME(a) a ## _C
  79. #include "rgb2rgb_template.c"
  80. #ifdef CAN_COMPILE_X86_ASM
  81. //MMX versions
  82. #undef RENAME
  83. #define HAVE_MMX
  84. #undef HAVE_MMX2
  85. #undef HAVE_3DNOW
  86. #define ARCH_X86
  87. #define RENAME(a) a ## _MMX
  88. #include "rgb2rgb_template.c"
  89. //MMX2 versions
  90. #undef RENAME
  91. #define HAVE_MMX
  92. #define HAVE_MMX2
  93. #undef HAVE_3DNOW
  94. #define ARCH_X86
  95. #define RENAME(a) a ## _MMX2
  96. #include "rgb2rgb_template.c"
  97. //3DNOW versions
  98. #undef RENAME
  99. #define HAVE_MMX
  100. #undef HAVE_MMX2
  101. #define HAVE_3DNOW
  102. #define ARCH_X86
  103. #define RENAME(a) a ## _3DNow
  104. #include "rgb2rgb_template.c"
  105. #endif //CAN_COMPILE_X86_ASM
  /**
   * Runtime dispatcher for rgb24to32.
   *
   * With CAN_COMPILE_X86_ASM defined, the gCpuCaps flags are tested in the
   * order MMX2, 3DNow, MMX and the first variant whose flag is set is
   * called. Otherwise, or when no flag is set, the _C variant is called.
   * The arguments are forwarded unchanged.
   * NOTE(review): src_size is presumably the input size in bytes - confirm
   * against rgb2rgb_template.c.
   */
  void rgb24to32(const uint8_t *src, uint8_t *dst, unsigned src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      /* ordered by speed, fastest first */
      if (gCpuCaps.hasMMX2) {
          rgb24to32_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb24to32_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb24to32_MMX(src, dst, src_size);
          return;
      }
  #endif
      rgb24to32_C(src, dst, src_size);
  }
  /**
   * Runtime dispatcher for rgb32to24.
   *
   * With CAN_COMPILE_X86_ASM defined, the gCpuCaps flags are tested in the
   * order MMX2, 3DNow, MMX and the first variant whose flag is set is
   * called. Otherwise, or when no flag is set, the _C variant is called.
   * The arguments are forwarded unchanged.
   * NOTE(review): src_size is presumably the input size in bytes - confirm
   * against rgb2rgb_template.c.
   */
  void rgb32to24(const uint8_t *src, uint8_t *dst, unsigned src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      /* ordered by speed, fastest first */
      if (gCpuCaps.hasMMX2) {
          rgb32to24_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb32to24_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb32to24_MMX(src, dst, src_size);
          return;
      }
  #endif
      rgb32to24_C(src, dst, src_size);
  }
  /*
   * Credits:
   *   Original by Strepto/Astral
   *   ported to gcc & bugfixed: A'rpi
   *   MMX2, 3DNOW optimization by Nick Kurshev
   *   32bit c version, and and&add trick by Michael Niedermayer
   *
   * Runtime dispatcher for rgb15to16.
   * With CAN_COMPILE_X86_ASM defined, the gCpuCaps flags are tested in the
   * order MMX2, 3DNow, MMX and the first variant whose flag is set is
   * called. Otherwise, or when no flag is set, the _C variant is called.
   * The arguments are forwarded unchanged.
   */
  void rgb15to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      /* ordered by speed, fastest first */
      if (gCpuCaps.hasMMX2) {
          rgb15to16_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb15to16_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb15to16_MMX(src, dst, src_size);
          return;
      }
  #endif
      rgb15to16_C(src, dst, src_size);
  }
  /**
   * Expand 8-bit palette indices into 32-bit pixels.
   *
   * Palette is assumed to contain bgr32, i.e. 4 bytes per entry. For each
   * of the num_pixels indices in src, the 4 bytes of the selected palette
   * entry are copied to dst in memory order.
   *
   * The copy is done byte by byte instead of through casted `unsigned`
   * pointers. This gives the same bytes in memory, but does not require
   * dst or palette to be 4-byte aligned, does not drop the const qualifier
   * from palette, and does not depend on sizeof(unsigned) being 4.
   *
   * @param src        num_pixels palette indices
   * @param dst        output buffer, at least 4*num_pixels bytes
   * @param num_pixels number of pixels to convert (0 is a no-op)
   * @param palette    palette, 4 bytes per entry, indexed by the src values
   */
  void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  {
      unsigned i;

      for (i = 0; i < num_pixels; i++) {
          const uint8_t *entry = palette + 4 * (unsigned)src[i];

          dst[0] = entry[0];
          dst[1] = entry[1];
          dst[2] = entry[2];
          dst[3] = entry[3];
          dst += 4;
      }
  }
  /**
   * Expand 8-bit palette indices into 24-bit pixels.
   *
   * Palette is assumed to contain bgr32 (4 bytes per entry); the first three
   * bytes of the selected entry are written to dst for every index in src.
   *
   * The bytes are stored one at a time: a single 32-bit store per pixel
   * would write 1 byte too much and might cause alignment issues on some
   * architectures.
   */
  void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  {
      const uint8_t *const src_end = src + num_pixels;

      //FIXME slow?
      while (src != src_end) {
          const uint8_t *entry = palette + *src * 4;

          *dst++ = entry[0];
          *dst++ = entry[1];
          *dst++ = entry[2];
          src++;
      }
  }
  /**
   * Runtime dispatcher for rgb32to16.
   *
   * With CAN_COMPILE_X86_ASM defined, the gCpuCaps flags are tested in the
   * order MMX2, 3DNow, MMX and the first variant whose flag is set is
   * called. Otherwise, or when no flag is set, the _C variant is called.
   * The arguments are forwarded unchanged.
   * NOTE(review): src_size is presumably the input size in bytes - confirm
   * against rgb2rgb_template.c.
   */
  void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      /* ordered by speed, fastest first */
      if (gCpuCaps.hasMMX2) {
          rgb32to16_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb32to16_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb32to16_MMX(src, dst, src_size);
          return;
      }
  #endif
      rgb32to16_C(src, dst, src_size);
  }
  /**
   * Runtime dispatcher for rgb32to15.
   *
   * With CAN_COMPILE_X86_ASM defined, the gCpuCaps flags are tested in the
   * order MMX2, 3DNow, MMX and the first variant whose flag is set is
   * called. Otherwise, or when no flag is set, the _C variant is called.
   * The arguments are forwarded unchanged.
   * NOTE(review): src_size is presumably the input size in bytes - confirm
   * against rgb2rgb_template.c.
   */
  void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      /* ordered by speed, fastest first */
      if (gCpuCaps.hasMMX2) {
          rgb32to15_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb32to15_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb32to15_MMX(src, dst, src_size);
          return;
      }
  #endif
      rgb32to15_C(src, dst, src_size);
  }
  /**
   * Runtime dispatcher for rgb24to16.
   *
   * With CAN_COMPILE_X86_ASM defined, the gCpuCaps flags are tested in the
   * order MMX2, 3DNow, MMX and the first variant whose flag is set is
   * called. Otherwise, or when no flag is set, the _C variant is called.
   * The arguments are forwarded unchanged.
   * NOTE(review): src_size is presumably the input size in bytes - confirm
   * against rgb2rgb_template.c.
   */
  void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      /* ordered by speed, fastest first */
      if (gCpuCaps.hasMMX2) {
          rgb24to16_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb24to16_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb24to16_MMX(src, dst, src_size);
          return;
      }
  #endif
      rgb24to16_C(src, dst, src_size);
  }
  /**
   * Runtime dispatcher for rgb24to15.
   *
   * With CAN_COMPILE_X86_ASM defined, the gCpuCaps flags are tested in the
   * order MMX2, 3DNow, MMX and the first variant whose flag is set is
   * called. Otherwise, or when no flag is set, the _C variant is called.
   * The arguments are forwarded unchanged.
   * NOTE(review): src_size is presumably the input size in bytes - confirm
   * against rgb2rgb_template.c.
   */
  void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      /* ordered by speed, fastest first */
      if (gCpuCaps.hasMMX2) {
          rgb24to15_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb24to15_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb24to15_MMX(src, dst, src_size);
          return;
      }
  #endif
      rgb24to15_C(src, dst, src_size);
  }
  /**
   * Expand 8-bit palette indices into 16-bit pixels.
   *
   * Palette is assumed to contain bgr16 (2 bytes per entry), see rgb32to16
   * to convert the palette. For each of the num_pixels indices in src, the
   * 2 bytes of the selected entry are copied to dst in memory order.
   *
   * The copy is done byte by byte instead of through casted uint16_t
   * pointers. This gives the same bytes in memory, but does not require
   * dst or palette to be 2-byte aligned and does not drop the const
   * qualifier from palette.
   *
   * @param src        num_pixels palette indices
   * @param dst        output buffer, at least 2*num_pixels bytes
   * @param num_pixels number of pixels to convert (0 is a no-op)
   * @param palette    palette, 2 bytes per entry, indexed by the src values
   */
  void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  {
      unsigned i;

      for (i = 0; i < num_pixels; i++) {
          const uint8_t *entry = palette + 2 * (unsigned)src[i];

          dst[0] = entry[0];
          dst[1] = entry[1];
          dst += 2;
      }
  }
  /**
   * Expand 8-bit palette indices into 15-bit pixels (stored in 2 bytes).
   *
   * Palette is assumed to contain bgr15 (2 bytes per entry), see rgb32to15
   * to convert the palette. For each of the num_pixels indices in src, the
   * 2 bytes of the selected entry are copied to dst in memory order.
   *
   * The copy is done byte by byte instead of through casted uint16_t
   * pointers. This gives the same bytes in memory, but does not require
   * dst or palette to be 2-byte aligned and does not drop the const
   * qualifier from palette.
   *
   * @param src        num_pixels palette indices
   * @param dst        output buffer, at least 2*num_pixels bytes
   * @param num_pixels number of pixels to convert (0 is a no-op)
   * @param palette    palette, 2 bytes per entry, indexed by the src values
   */
  void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  {
      unsigned i;

      for (i = 0; i < num_pixels; i++) {
          const uint8_t *entry = palette + 2 * (unsigned)src[i];

          dst[0] = entry[0];
          dst[1] = entry[1];
          dst += 2;
      }
  }
  /**
   * Runtime dispatcher for rgb32tobgr32.
   *
   * With CAN_COMPILE_X86_ASM defined, the gCpuCaps flags are tested in the
   * order MMX2, 3DNow, MMX and the first variant whose flag is set is
   * called. Otherwise, or when no flag is set, the _C variant is called.
   * The arguments are forwarded unchanged.
   * NOTE(review): src_size is presumably the input size in bytes - confirm
   * against rgb2rgb_template.c.
   */
  void rgb32tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
  {
  #ifdef CAN_COMPILE_X86_ASM
      /* ordered by speed, fastest first */
      if (gCpuCaps.hasMMX2) {
          rgb32tobgr32_MMX2(src, dst, src_size);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb32tobgr32_3DNow(src, dst, src_size);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb32tobgr32_MMX(src, dst, src_size);
          return;
      }
  #endif
      rgb32tobgr32_C(src, dst, src_size);
  }
  /**
   * Runtime dispatcher for yv12toyuy2.
   *
   * height should be a multiple of 2 and width should be a multiple of 16
   * (if this is a problem for anyone then tell me, and I'll fix it).
   *
   * With CAN_COMPILE_X86_ASM defined, the gCpuCaps flags are tested in the
   * order MMX2, 3DNow, MMX and the first variant whose flag is set is
   * called. Otherwise, or when no flag is set, the _C variant is called.
   * The arguments are forwarded unchanged.
   */
  void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                  unsigned int width, unsigned int height,
                  unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
  {
  #ifdef CAN_COMPILE_X86_ASM
      /* ordered by speed, fastest first */
      if (gCpuCaps.hasMMX2) {
          yv12toyuy2_MMX2(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
          return;
      }
      if (gCpuCaps.has3DNow) {
          yv12toyuy2_3DNow(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
          return;
      }
      if (gCpuCaps.hasMMX) {
          yv12toyuy2_MMX(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
          return;
      }
  #endif
      yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  }
  /**
   * Runtime dispatcher for yuy2toyv12.
   *
   * height should be a multiple of 2 and width should be a multiple of 16
   * (if this is a problem for anyone then tell me, and I'll fix it).
   *
   * With CAN_COMPILE_X86_ASM defined, the gCpuCaps flags are tested in the
   * order MMX2, 3DNow, MMX and the first variant whose flag is set is
   * called. Otherwise, or when no flag is set, the _C variant is called.
   * The arguments are forwarded unchanged.
   */
  void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                  unsigned int width, unsigned int height,
                  unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  {
  #ifdef CAN_COMPILE_X86_ASM
      /* ordered by speed, fastest first */
      if (gCpuCaps.hasMMX2) {
          yuy2toyv12_MMX2(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
      if (gCpuCaps.has3DNow) {
          yuy2toyv12_3DNow(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
      if (gCpuCaps.hasMMX) {
          yuy2toyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
  #endif
      yuy2toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  }
  /**
   * Runtime dispatcher for uyvytoyv12.
   *
   * height should be a multiple of 2 and width should be a multiple of 16
   * (if this is a problem for anyone then tell me, and I'll fix it).
   * Chrominance data is only taken from every second line, the others are
   * ignored. FIXME write HQ version
   *
   * With CAN_COMPILE_X86_ASM defined, the gCpuCaps flags are tested in the
   * order MMX2, 3DNow, MMX and the first variant whose flag is set is
   * called. Otherwise, or when no flag is set, the _C variant is called.
   * The arguments are forwarded unchanged.
   */
  void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                  unsigned int width, unsigned int height,
                  unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  {
  #ifdef CAN_COMPILE_X86_ASM
      /* ordered by speed, fastest first */
      if (gCpuCaps.hasMMX2) {
          uyvytoyv12_MMX2(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
      if (gCpuCaps.has3DNow) {
          uyvytoyv12_3DNow(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
      if (gCpuCaps.hasMMX) {
          uyvytoyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
  #endif
      uyvytoyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  }
  /**
   * Runtime dispatcher for rgb24toyv12.
   *
   * height should be a multiple of 2 and width should be a multiple of 2
   * (if this is a problem for anyone then tell me, and I'll fix it).
   * Chrominance data is only taken from every second line, the others are
   * ignored. FIXME write HQ version
   *
   * With CAN_COMPILE_X86_ASM defined, the gCpuCaps flags are tested in the
   * order MMX2, 3DNow, MMX and the first variant whose flag is set is
   * called. Otherwise, or when no flag is set, the _C variant is called.
   * The arguments are forwarded unchanged.
   */
  void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                   unsigned int width, unsigned int height,
                   unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  {
  #ifdef CAN_COMPILE_X86_ASM
      /* ordered by speed, fastest first */
      if (gCpuCaps.hasMMX2) {
          rgb24toyv12_MMX2(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
      if (gCpuCaps.has3DNow) {
          rgb24toyv12_3DNow(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
      if (gCpuCaps.hasMMX) {
          rgb24toyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
          return;
      }
  #endif
      rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  }