You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

412 lines
13KB

  1. /*
  2. *
  3. * rgb2rgb.c, Software RGB to RGB convertor
  4. * pluralize by Software PAL8 to RGB convertor
  5. * Software YUV to YUV convertor
  6. * Software YUV to RGB convertor
  7. * Written by Nick Kurshev.
  8. * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
  9. */
  10. #include <inttypes.h>
  11. #include "../config.h"
  12. #include "rgb2rgb.h"
  13. #include "../cpudetect.h"
  14. #include "../mangle.h"
/*
 * Build switches.
 * CAN_COMPILE_X86_ASM enables the MMX/MMX2/3DNow variants further down; it is
 * derived from ARCH_X86 (presumably provided by ../config.h - confirm).
 */
#ifdef ARCH_X86
#define CAN_COMPILE_X86_ASM
#endif
#define FAST_BGR2YV12 // use 7-bit coefficients instead of 15-bit ones
  19. #ifdef CAN_COMPILE_X86_ASM
  20. static const uint64_t mask32b __attribute__((aligned(8))) = 0x000000FF000000FFULL;
  21. static const uint64_t mask32g __attribute__((aligned(8))) = 0x0000FF000000FF00ULL;
  22. static const uint64_t mask32r __attribute__((aligned(8))) = 0x00FF000000FF0000ULL;
  23. static const uint64_t mask32 __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;
  24. static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
  25. static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
  26. static const uint64_t mask24hh __attribute__((aligned(8))) = 0xffff000000000000ULL;
  27. static const uint64_t mask24hhh __attribute__((aligned(8))) = 0xffffffff00000000ULL;
  28. static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL;
  29. static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
  30. static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
  31. static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;
  32. static const uint64_t red_16mask __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
  33. static const uint64_t green_16mask __attribute__((aligned(8)))= 0x000007e0000007e0ULL;
  34. static const uint64_t blue_16mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
  35. static const uint64_t red_15mask __attribute__((aligned(8))) = 0x00007c000000f800ULL;
  36. static const uint64_t green_15mask __attribute__((aligned(8)))= 0x000003e0000007e0ULL;
  37. static const uint64_t blue_15mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
  38. #ifdef FAST_BGR2YV12
  39. static const uint64_t bgr2YCoeff __attribute__((aligned(8))) = 0x000000210041000DULL;
  40. static const uint64_t bgr2UCoeff __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
  41. static const uint64_t bgr2VCoeff __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
  42. #else
  43. static const uint64_t bgr2YCoeff __attribute__((aligned(8))) = 0x000020E540830C8BULL;
  44. static const uint64_t bgr2UCoeff __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
  45. static const uint64_t bgr2VCoeff __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
  46. #endif
  47. static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL;
  48. static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL;
  49. static const uint64_t w1111 __attribute__((aligned(8))) = 0x0001000100010001ULL;
  50. #if 0
  51. static volatile uint64_t __attribute__((aligned(8))) b5Dither;
  52. static volatile uint64_t __attribute__((aligned(8))) g5Dither;
  53. static volatile uint64_t __attribute__((aligned(8))) g6Dither;
  54. static volatile uint64_t __attribute__((aligned(8))) r5Dither;
  55. static uint64_t __attribute__((aligned(8))) dither4[2]={
  56. 0x0103010301030103LL,
  57. 0x0200020002000200LL,};
  58. static uint64_t __attribute__((aligned(8))) dither8[2]={
  59. 0x0602060206020602LL,
  60. 0x0004000400040004LL,};
  61. #endif
  62. #endif
  63. #define RGB2YUV_SHIFT 8
  64. #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
  65. #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
  66. #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
  67. #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
  68. #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
  69. #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
  70. #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
  71. #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
  72. #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
/*
 * rgb2rgb_template.c is included once per target below. Before each inclusion
 * the HAVE_* / ARCH_X86 feature macros are set for that target and RENAME() is
 * redefined to append a target suffix (_C, _MMX, _MMX2, _3DNow). The template
 * presumably wraps its function names in RENAME() - that is where the
 * suffixed functions called by the dispatchers in this file come from.
 * The order of the #undef/#define lines matters; do not reorder them.
 */
//Note: we have C, MMX, MMX2 and 3DNOW versions; there is no combined 3DNOW+MMX2 one
//Plain C versions
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef ARCH_X86
#define RENAME(a) a ## _C
#include "rgb2rgb_template.c"
#ifdef CAN_COMPILE_X86_ASM
//MMX versions
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX
#include "rgb2rgb_template.c"
//MMX2 versions
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX2
#include "rgb2rgb_template.c"
//3DNOW versions
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _3DNow
#include "rgb2rgb_template.c"
#endif //CAN_COMPILE_X86_ASM
  107. void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
  108. {
  109. #ifdef CAN_COMPILE_X86_ASM
  110. // ordered per speed fasterst first
  111. if(gCpuCaps.hasMMX2)
  112. rgb24to32_MMX2(src, dst, src_size);
  113. else if(gCpuCaps.has3DNow)
  114. rgb24to32_3DNow(src, dst, src_size);
  115. else if(gCpuCaps.hasMMX)
  116. rgb24to32_MMX(src, dst, src_size);
  117. else
  118. rgb24to32_C(src, dst, src_size);
  119. #else
  120. rgb24to32_C(src, dst, src_size);
  121. #endif
  122. }
  123. void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
  124. {
  125. #ifdef CAN_COMPILE_X86_ASM
  126. // ordered per speed fasterst first
  127. if(gCpuCaps.hasMMX2)
  128. rgb32to24_MMX2(src, dst, src_size);
  129. else if(gCpuCaps.has3DNow)
  130. rgb32to24_3DNow(src, dst, src_size);
  131. else if(gCpuCaps.hasMMX)
  132. rgb32to24_MMX(src, dst, src_size);
  133. else
  134. rgb32to24_C(src, dst, src_size);
  135. #else
  136. rgb32to24_C(src, dst, src_size);
  137. #endif
  138. }
  139. /*
  140. Original by Strepto/Astral
  141. ported to gcc & bugfixed : A'rpi
  142. MMX2, 3DNOW optimization by Nick Kurshev
  143. 32bit c version, and and&add trick by Michael Niedermayer
  144. */
  145. void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size)
  146. {
  147. #ifdef CAN_COMPILE_X86_ASM
  148. // ordered per speed fasterst first
  149. if(gCpuCaps.hasMMX2)
  150. rgb15to16_MMX2(src, dst, src_size);
  151. else if(gCpuCaps.has3DNow)
  152. rgb15to16_3DNow(src, dst, src_size);
  153. else if(gCpuCaps.hasMMX)
  154. rgb15to16_MMX(src, dst, src_size);
  155. else
  156. rgb15to16_C(src, dst, src_size);
  157. #else
  158. rgb15to16_C(src, dst, src_size);
  159. #endif
  160. }
  161. /**
  162. * Pallete is assumed to contain bgr32
  163. */
  164. void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  165. {
  166. unsigned i;
  167. for(i=0; i<num_pixels; i++)
  168. ((unsigned *)dst)[i] = ((unsigned *)palette)[ src[i] ];
  169. }
  170. /**
  171. * Pallete is assumed to contain bgr32
  172. */
  173. void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  174. {
  175. unsigned i;
  176. /*
  177. writes 1 byte o much and might cause alignment issues on some architectures?
  178. for(i=0; i<num_pixels; i++)
  179. ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[ src[i] ];
  180. */
  181. for(i=0; i<num_pixels; i++)
  182. {
  183. //FIXME slow?
  184. dst[0]= palette[ src[i]*4+0 ];
  185. dst[1]= palette[ src[i]*4+1 ];
  186. dst[2]= palette[ src[i]*4+2 ];
  187. dst+= 3;
  188. }
  189. }
  190. void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
  191. {
  192. #ifdef CAN_COMPILE_X86_ASM
  193. // ordered per speed fasterst first
  194. if(gCpuCaps.hasMMX2)
  195. rgb32to16_MMX2(src, dst, src_size);
  196. else if(gCpuCaps.has3DNow)
  197. rgb32to16_3DNow(src, dst, src_size);
  198. else if(gCpuCaps.hasMMX)
  199. rgb32to16_MMX(src, dst, src_size);
  200. else
  201. rgb32to16_C(src, dst, src_size);
  202. #else
  203. rgb32to16_C(src, dst, src_size);
  204. #endif
  205. }
  206. void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
  207. {
  208. #ifdef CAN_COMPILE_X86_ASM
  209. // ordered per speed fasterst first
  210. if(gCpuCaps.hasMMX2)
  211. rgb32to15_MMX2(src, dst, src_size);
  212. else if(gCpuCaps.has3DNow)
  213. rgb32to15_3DNow(src, dst, src_size);
  214. else if(gCpuCaps.hasMMX)
  215. rgb32to15_MMX(src, dst, src_size);
  216. else
  217. rgb32to15_C(src, dst, src_size);
  218. #else
  219. rgb32to15_C(src, dst, src_size);
  220. #endif
  221. }
  222. void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
  223. {
  224. #ifdef CAN_COMPILE_X86_ASM
  225. // ordered per speed fasterst first
  226. if(gCpuCaps.hasMMX2)
  227. rgb24to16_MMX2(src, dst, src_size);
  228. else if(gCpuCaps.has3DNow)
  229. rgb24to16_3DNow(src, dst, src_size);
  230. else if(gCpuCaps.hasMMX)
  231. rgb24to16_MMX(src, dst, src_size);
  232. else
  233. rgb24to16_C(src, dst, src_size);
  234. #else
  235. rgb24to16_C(src, dst, src_size);
  236. #endif
  237. }
  238. void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
  239. {
  240. #ifdef CAN_COMPILE_X86_ASM
  241. // ordered per speed fasterst first
  242. if(gCpuCaps.hasMMX2)
  243. rgb24to15_MMX2(src, dst, src_size);
  244. else if(gCpuCaps.has3DNow)
  245. rgb24to15_3DNow(src, dst, src_size);
  246. else if(gCpuCaps.hasMMX)
  247. rgb24to15_MMX(src, dst, src_size);
  248. else
  249. rgb24to15_C(src, dst, src_size);
  250. #else
  251. rgb24to15_C(src, dst, src_size);
  252. #endif
  253. }
  254. /**
  255. * Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
  256. */
  257. void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  258. {
  259. unsigned i;
  260. for(i=0; i<num_pixels; i++)
  261. ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
  262. }
  263. /**
  264. * Pallete is assumed to contain bgr15, see rgb32to15 to convert the palette
  265. */
  266. void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
  267. {
  268. unsigned i;
  269. for(i=0; i<num_pixels; i++)
  270. ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
  271. }
  272. void rgb32tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
  273. {
  274. #ifdef CAN_COMPILE_X86_ASM
  275. // ordered per speed fasterst first
  276. if(gCpuCaps.hasMMX2)
  277. rgb32tobgr32_MMX2(src, dst, src_size);
  278. else if(gCpuCaps.has3DNow)
  279. rgb32tobgr32_3DNow(src, dst, src_size);
  280. else if(gCpuCaps.hasMMX)
  281. rgb32tobgr32_MMX(src, dst, src_size);
  282. else
  283. rgb32tobgr32_C(src, dst, src_size);
  284. #else
  285. rgb32tobgr32_C(src, dst, src_size);
  286. #endif
  287. }
  288. /**
  289. *
  290. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  291. * problem for anyone then tell me, and ill fix it)
  292. */
  293. void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  294. unsigned int width, unsigned int height,
  295. unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
  296. {
  297. #ifdef CAN_COMPILE_X86_ASM
  298. // ordered per speed fasterst first
  299. if(gCpuCaps.hasMMX2)
  300. yv12toyuy2_MMX2(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  301. else if(gCpuCaps.has3DNow)
  302. yv12toyuy2_3DNow(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  303. else if(gCpuCaps.hasMMX)
  304. yv12toyuy2_MMX(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  305. else
  306. yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  307. #else
  308. yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride);
  309. #endif
  310. }
  311. /**
  312. *
  313. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  314. * problem for anyone then tell me, and ill fix it)
  315. */
  316. void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  317. unsigned int width, unsigned int height,
  318. unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  319. {
  320. #ifdef CAN_COMPILE_X86_ASM
  321. // ordered per speed fasterst first
  322. if(gCpuCaps.hasMMX2)
  323. yuy2toyv12_MMX2(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  324. else if(gCpuCaps.has3DNow)
  325. yuy2toyv12_3DNow(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  326. else if(gCpuCaps.hasMMX)
  327. yuy2toyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  328. else
  329. yuy2toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  330. #else
  331. yuy2toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  332. #endif
  333. }
  334. /**
  335. *
  336. * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
  337. * problem for anyone then tell me, and ill fix it)
  338. * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
  339. */
  340. void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  341. unsigned int width, unsigned int height,
  342. unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  343. {
  344. #ifdef CAN_COMPILE_X86_ASM
  345. // ordered per speed fasterst first
  346. if(gCpuCaps.hasMMX2)
  347. uyvytoyv12_MMX2(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  348. else if(gCpuCaps.has3DNow)
  349. uyvytoyv12_3DNow(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  350. else if(gCpuCaps.hasMMX)
  351. uyvytoyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  352. else
  353. uyvytoyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  354. #else
  355. uyvytoyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  356. #endif
  357. }
  358. /**
  359. *
  360. * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
  361. * problem for anyone then tell me, and ill fix it)
  362. * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
  363. */
  364. void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  365. unsigned int width, unsigned int height,
  366. unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
  367. {
  368. #ifdef CAN_COMPILE_X86_ASM
  369. // ordered per speed fasterst first
  370. if(gCpuCaps.hasMMX2)
  371. rgb24toyv12_MMX2(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  372. else if(gCpuCaps.has3DNow)
  373. rgb24toyv12_3DNow(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  374. else if(gCpuCaps.hasMMX)
  375. rgb24toyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  376. else
  377. rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  378. #else
  379. rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride);
  380. #endif
  381. }