// Software scaling and colorspace conversion routines for MPlayer
// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
// the parts written by michael are under GNU GPL

#include <inttypes.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include "../config.h"
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#include "swscale.h"
#include "../cpudetect.h"

#undef MOVNTQ
#undef PAVGB

//#undef HAVE_MMX2
//#undef HAVE_MMX
//#undef ARCH_X86
#define DITHER1XBPP

int fullUVIpol=0;
//disables the unscaled height version
int allwaysIpol=0;

#define RET 0xC3 //near return opcode

//#define ASSERT(x) if(!(x)) { printf("ASSERT " #x " failed\n"); *((int*)0)=0; }
#define ASSERT(x) ;

extern int verbose; // defined in mplayer.c
/*
NOTES

known BUGS with known cause (no bug reports please, but patches are welcome :) )
horizontal fast_bilinear MMX2 scaler reads 1-7 samples too much (might cause a sig11)

Supported output formats: BGR15 BGR16 BGR24 BGR32 YV12
BGR15 & BGR16 MMX versions support dithering
Special versions: fast Y 1:1 scaling (no interpolation in y direction)

TODO
more intelligent misalignment avoidance for the horizontal scaler
dither in C
change the distance of the u & v buffer
Move static / global vars into a struct so multiple scalers can be used
write special vertical cubic upscale version
Optimize C code (yv12 / minmax)
dstStride[3]
*/
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))

#ifdef ARCH_X86
#define CAN_COMPILE_X86_ASM
#endif

#ifdef CAN_COMPILE_X86_ASM
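// Note: each 64 bit constant below holds the same 16 bit value replicated into
// all four words, so a single MMX multiply or mask handles four samples at once.
// yCoeff/vrCoeff/ubCoeff match the yuvtab_* factors further down, and
// vgCoeff/ugCoeff are the two green coefficients stored negated
// (presumably for pmulhw / pand in the code included from swscale_template.c).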
static uint64_t __attribute__((aligned(8))) yCoeff= 0x2568256825682568LL;
static uint64_t __attribute__((aligned(8))) vrCoeff= 0x3343334333433343LL;
static uint64_t __attribute__((aligned(8))) ubCoeff= 0x40cf40cf40cf40cfLL;
static uint64_t __attribute__((aligned(8))) vgCoeff= 0xE5E2E5E2E5E2E5E2LL;
static uint64_t __attribute__((aligned(8))) ugCoeff= 0xF36EF36EF36EF36ELL;
static uint64_t __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL;
static uint64_t __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL;
static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL;
static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL;
static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL;
static uint64_t __attribute__((aligned(8))) w02= 0x0002000200020002LL;
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;

static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;

static uint64_t __attribute__((aligned(8))) dither4[2]={
    0x0103010301030103LL,
    0x0200020002000200LL,};

static uint64_t __attribute__((aligned(8))) dither8[2]={
    0x0602060206020602LL,
    0x0004000400040004LL,};

static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL;
static uint64_t __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL;
static uint64_t __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL;

static uint64_t __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFLL;
static uint64_t __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00LL;
static uint64_t __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000LL;

static uint64_t __attribute__((aligned(8))) temp0;
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;

static int16_t __attribute__((aligned(8))) *lumPixBuf[2000];
static int16_t __attribute__((aligned(8))) *chrPixBuf[2000];
static int16_t __attribute__((aligned(8))) hLumFilter[8000];
static int16_t __attribute__((aligned(8))) hLumFilterPos[2000];
static int16_t __attribute__((aligned(8))) hChrFilter[8000];
static int16_t __attribute__((aligned(8))) hChrFilterPos[2000];
static int16_t __attribute__((aligned(8))) vLumFilter[8000];
static int16_t __attribute__((aligned(8))) vLumFilterPos[2000];
static int16_t __attribute__((aligned(8))) vChrFilter[8000];
static int16_t __attribute__((aligned(8))) vChrFilterPos[2000];
// simply contain the values from v(Lum|Chr)Filter, just nicely packed for MMX
//FIXME these are very likely too small / 8000 caused problems with 480x480
static int16_t __attribute__((aligned(8))) lumMmxFilter[16000];
static int16_t __attribute__((aligned(8))) chrMmxFilter[16000];
#else
static int16_t *lumPixBuf[2000];
static int16_t *chrPixBuf[2000];
static int16_t hLumFilter[8000];
static int16_t hLumFilterPos[2000];
static int16_t hChrFilter[8000];
static int16_t hChrFilterPos[2000];
static int16_t vLumFilter[8000];
static int16_t vLumFilterPos[2000];
static int16_t vChrFilter[8000];
static int16_t vChrFilterPos[2000];

//FIXME just dummy vars
static int16_t lumMmxFilter[1];
static int16_t chrMmxFilter[1];
#endif

// clipping helper table for C implementations:
static unsigned char clip_table[768];

static unsigned short clip_table16b[768];
static unsigned short clip_table16g[768];
static unsigned short clip_table16r[768];
static unsigned short clip_table15b[768];
static unsigned short clip_table15g[768];
static unsigned short clip_table15r[768];

// yuv->rgb conversion tables:
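// the hex suffixes are the 13 bit fixed point factors used by SwScale_Init():
// 0x2568/8192 ~= 1.17 (luma scale), 0x3343/8192 ~= 1.60 (V->R),
// 0x0c92/8192 ~= 0.39 (U->G, stored negated), 0x1a1e/8192 ~= 0.82 (V->G, stored
// negated), 0x40cf/8192 ~= 2.03 (U->B) -- roughly the ITU-R BT.601 YUV->RGB matrix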
static int yuvtab_2568[256];
static int yuvtab_3343[256];
static int yuvtab_0c92[256];
static int yuvtab_1a1e[256];
static int yuvtab_40cf[256];

// Needed for cubic scaler to catch overflows
static int clip_yuvtab_2568[768];
static int clip_yuvtab_3343[768];
static int clip_yuvtab_0c92[768];
static int clip_yuvtab_1a1e[768];
static int clip_yuvtab_40cf[768];

static int hLumFilterSize=0;
static int hChrFilterSize=0;
static int vLumFilterSize=0;
static int vChrFilterSize=0;
static int vLumBufSize=0;
static int vChrBufSize=0;

int sws_flags=0;

#ifdef CAN_COMPILE_X86_ASM
static uint8_t funnyYCode[10000];
static uint8_t funnyUVCode[10000];
#endif

static int canMMX2BeUsed=0;
#ifdef CAN_COMPILE_X86_ASM
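// dummy function which just references all the variables above that are only
// used from inline asm, apparently so the compiler neither warns about them
// being unused nor optimizes them away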
void in_asm_used_var_warning_killer()
{
    volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
        bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+temp0+asm_yalpha1+ asm_uvalpha1+
        M24A+M24B+M24C+w02 + funnyYCode[0]+ funnyUVCode[0]+b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0];
    if(i) i=0;
}
#endif
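// The two functions below are the plain C reference versions of the vertical
// scaler + output stage: for every destination pixel they weight
// (lum|chr)FilterSize horizontally prescaled source lines with 16 bit filter
// coefficients; the intermediate samples apparently carry enough fractional
// bits that >>19 brings the sum back to 8 bit range. The V samples are stored
// 2048 entries after the U samples in the same chroma line buffer.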
static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                               int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                               uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
    //FIXME Optimize (just quickly written, not optimized)
    int i;
    for(i=0; i<dstW; i++)
    {
        int val=0;
        int j;
        for(j=0; j<lumFilterSize; j++)
            val += lumSrc[j][i] * lumFilter[j];

        dest[i]= MIN(MAX(val>>19, 0), 255);
    }

    if(uDest != NULL)
        for(i=0; i<(dstW>>1); i++)
        {
            int u=0;
            int v=0;
            int j;
            for(j=0; j<chrFilterSize; j++)
            {
                u += chrSrc[j][i] * chrFilter[j];
                v += chrSrc[j][i + 2048] * chrFilter[j];
            }

            uDest[i]= MIN(MAX(u>>19, 0), 255);
            vDest[i]= MIN(MAX(v>>19, 0), 255);
        }
}
static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                               int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                               uint8_t *dest, int dstW, int dstbpp)
{
    if(dstbpp==32)
    {
        int i;
        for(i=0; i<(dstW>>1); i++){
            int j;
            int Y1=0;
            int Y2=0;
            int U=0;
            int V=0;
            int Cb, Cr, Cg;
            for(j=0; j<lumFilterSize; j++)
            {
                Y1 += lumSrc[j][2*i] * lumFilter[j];
                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
            }
            for(j=0; j<chrFilterSize; j++)
            {
                U += chrSrc[j][i] * chrFilter[j];
                V += chrSrc[j][i+2048] * chrFilter[j];
            }
            Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
            Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
            U >>= 19;
            V >>= 19;

            Cb= clip_yuvtab_40cf[U+ 256];
            Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
            Cr= clip_yuvtab_3343[V+ 256];

            dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
            dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
            dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];

            dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
            dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
            dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
        }
    }
    else if(dstbpp==24)
    {
        int i;
        for(i=0; i<(dstW>>1); i++){
            int j;
            int Y1=0;
            int Y2=0;
            int U=0;
            int V=0;
            int Cb, Cr, Cg;
            for(j=0; j<lumFilterSize; j++)
            {
                Y1 += lumSrc[j][2*i] * lumFilter[j];
                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
            }
            for(j=0; j<chrFilterSize; j++)
            {
                U += chrSrc[j][i] * chrFilter[j];
                V += chrSrc[j][i+2048] * chrFilter[j];
            }
            Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
            Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
            U >>= 19;
            V >>= 19;

            Cb= clip_yuvtab_40cf[U+ 256];
            Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
            Cr= clip_yuvtab_3343[V+ 256];

            dest[0]=clip_table[((Y1 + Cb) >>13)];
            dest[1]=clip_table[((Y1 + Cg) >>13)];
            dest[2]=clip_table[((Y1 + Cr) >>13)];
            dest[3]=clip_table[((Y2 + Cb) >>13)];
            dest[4]=clip_table[((Y2 + Cg) >>13)];
            dest[5]=clip_table[((Y2 + Cr) >>13)];
            dest+=6;
        }
    }
    else if(dstbpp==16)
    {
        int i;
        for(i=0; i<(dstW>>1); i++){
            int j;
            int Y1=0;
            int Y2=0;
            int U=0;
            int V=0;
            int Cb, Cr, Cg;
            for(j=0; j<lumFilterSize; j++)
            {
                Y1 += lumSrc[j][2*i] * lumFilter[j];
                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
            }
            for(j=0; j<chrFilterSize; j++)
            {
                U += chrSrc[j][i] * chrFilter[j];
                V += chrSrc[j][i+2048] * chrFilter[j];
            }
            Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
            Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
            U >>= 19;
            V >>= 19;

            Cb= clip_yuvtab_40cf[U+ 256];
            Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
            Cr= clip_yuvtab_3343[V+ 256];

            ((uint16_t*)dest)[2*i] =
                clip_table16b[(Y1 + Cb) >>13] |
                clip_table16g[(Y1 + Cg) >>13] |
                clip_table16r[(Y1 + Cr) >>13];

            ((uint16_t*)dest)[2*i+1] =
                clip_table16b[(Y2 + Cb) >>13] |
                clip_table16g[(Y2 + Cg) >>13] |
                clip_table16r[(Y2 + Cr) >>13];
        }
    }
    else if(dstbpp==15)
    {
        int i;
        for(i=0; i<(dstW>>1); i++){
            int j;
            int Y1=0;
            int Y2=0;
            int U=0;
            int V=0;
            int Cb, Cr, Cg;
            for(j=0; j<lumFilterSize; j++)
            {
                Y1 += lumSrc[j][2*i] * lumFilter[j];
                Y2 += lumSrc[j][2*i+1] * lumFilter[j];
            }
            for(j=0; j<chrFilterSize; j++)
            {
                U += chrSrc[j][i] * chrFilter[j];
                V += chrSrc[j][i+2048] * chrFilter[j];
            }
            Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
            Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
            U >>= 19;
            V >>= 19;

            Cb= clip_yuvtab_40cf[U+ 256];
            Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
            Cr= clip_yuvtab_3343[V+ 256];

            ((uint16_t*)dest)[2*i] =
                clip_table15b[(Y1 + Cb) >>13] |
                clip_table15g[(Y1 + Cg) >>13] |
                clip_table15r[(Y1 + Cr) >>13];

            ((uint16_t*)dest)[2*i+1] =
                clip_table15b[(Y2 + Cb) >>13] |
                clip_table15g[(Y2 + Cg) >>13] |
                clip_table15r[(Y2 + Cr) >>13];
        }
    }
}
//Note: we have C, X86, MMX, MMX2, 3DNOW versions; there is no 3DNOW+MMX2 one
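// swscale_template.c gets included once per variant below, each time with a
// different combination of HAVE_MMX / HAVE_MMX2 / HAVE_3DNOW defined and with
// RENAME() appending _C, _MMX, _MMX2 or _3DNow, so SwScale_YV12slice() further
// down can dispatch to the fastest compiled-in version at runtime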
//Plain C versions
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
#define COMPILE_C
#endif

#ifdef CAN_COMPILE_X86_ASM

#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX
#endif

#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX2
#endif

#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_3DNOW
#endif
#endif //CAN_COMPILE_X86_ASM

#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef ARCH_X86

#ifdef COMPILE_C
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef ARCH_X86
#define RENAME(a) a ## _C
#include "swscale_template.c"
#endif

#ifdef CAN_COMPILE_X86_ASM

//X86 versions
/*
#undef RENAME
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _X86
#include "swscale_template.c"
*/

//MMX versions
#ifdef COMPILE_MMX
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX
#include "swscale_template.c"
#endif

//MMX2 versions
#ifdef COMPILE_MMX2
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX2
#include "swscale_template.c"
#endif

//3DNOW versions
#ifdef COMPILE_3DNOW
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _3DNow
#include "swscale_template.c"
#endif

#endif //CAN_COMPILE_X86_ASM
// minor note: the HAVE_xyz macros are messed up after this point, so don't use them

// *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices:
// *** Note: it's called multiple times while decoding a frame, first time y==0
// switching the CPU type during sliced drawing can have bad effects, like a sig11
void SwScale_YV12slice(unsigned char* srcptr[], int stride[], int srcSliceY,
                       int srcSliceH, uint8_t* dstptr[], int dststride, int dstbpp,
                       int srcW, int srcH, int dstW, int dstH){
#ifdef RUNTIME_CPUDETECT
#ifdef CAN_COMPILE_X86_ASM
    // ordered by speed, fastest first
    if(gCpuCaps.hasMMX2)
        SwScale_YV12slice_MMX2(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
    else if(gCpuCaps.has3DNow)
        SwScale_YV12slice_3DNow(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
    else if(gCpuCaps.hasMMX)
        SwScale_YV12slice_MMX(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
    else
        SwScale_YV12slice_C(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
#else
    SwScale_YV12slice_C(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
#endif
#else //RUNTIME_CPUDETECT
#ifdef HAVE_MMX2
    SwScale_YV12slice_MMX2(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
#elif defined (HAVE_3DNOW)
    SwScale_YV12slice_3DNow(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
#elif defined (HAVE_MMX)
    SwScale_YV12slice_MMX(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
#else
    SwScale_YV12slice_C(srcptr, stride, srcSliceY, srcSliceH, dstptr, dststride, dstbpp, srcW, srcH, dstW, dstH);
#endif
#endif //!RUNTIME_CPUDETECT
}
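/* rough usage sketch (not taken from MPlayer itself; the buffer / stride names
   are made up for illustration): SwScale_Init() has to be called once, then a
   frame is fed as one or more slices with srcSliceY starting at 0, e.g. for
   YV12 -> BGR32:

       unsigned char *src[3]       = { y_plane, u_plane, v_plane };
       int            srcStride[3] = { y_stride, uv_stride, uv_stride };
       uint8_t       *dst[3]       = { bgr32_buffer, NULL, NULL }; // sketch assumption: only dst[0] is used for RGB output

       SwScale_Init();
       SwScale_YV12slice(src, srcStride, 0, srcH, dst, dst_stride, 32,
                         srcW, srcH, dstW, dstH);
*/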
void SwScale_Init(){
    // generating tables:
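    // clip_table[] has 768 entries and maps i to clamp(i-256, 0, 255); the
    // 256<<13 bias folded into the luma table below keeps the later
    // clip_table[(Y+C)>>13] lookups centred in that range, so moderately
    // out-of-range values clamp without any branching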
    int i;
    for(i=0; i<768; i++){
        int c= MIN(MAX(i-256, 0), 255);
        clip_table[i]=c;
        yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
        yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
        yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
        yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
        yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
    }
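    // each clip_table16*/15* entry holds the clipped 8 bit value already shifted
    // into its RGB565 / RGB555 bit position, so a 16bpp pixel is just the OR of
    // three lookups -- equivalent to ((r>>3)<<11) | ((g>>2)<<5) | (b>>3) for 5-6-5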
    for(i=0; i<768; i++)
    {
        int v= clip_table[i];
        clip_table16b[i]= v>>3;
        clip_table16g[i]= (v<<3)&0x07E0;
        clip_table16r[i]= (v<<8)&0xF800;
        clip_table15b[i]= v>>3;
        clip_table15g[i]= (v<<2)&0x03E0;
        clip_table15r[i]= (v<<7)&0x7C00;
    }
}