You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2612 lines
71KB

  1. /*
  2. Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
  3. This program is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program; if not, write to the Free Software
  13. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  14. */
  15. /*
  16. supported Input formats: YV12, I420, IYUV, YUY2, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8, Y800, YVU9
  17. supported output formats: YV12, I420, IYUV, BGR15, BGR16, BGR24, BGR32, Y8, Y800, YVU9
  18. BGR15/16 support dithering
  19. unscaled special converters
  20. YV12/I420/IYUV -> BGR15/BGR16/BGR24/BGR32
  21. YV12/I420/IYUV -> YV12/I420/IYUV
  22. YUY2/BGR15/BGR16/BGR24/BGR32/RGB24/RGB32 -> same format
  23. BGR24 -> BGR32 & RGB24 -> RGB32
  24. BGR32 -> BGR24 & RGB32 -> RGB24
  25. BGR15 -> BGR16
  26. */
  27. /*
  28. tested special converters
  29. YV12/I420 -> BGR16
  30. YV12 -> YV12
  31. BGR15 -> BGR16
  32. BGR16 -> BGR16
  33. untested special converters
  34. YV12/I420 -> BGR15/BGR24/BGR32 (its the yuv2rgb stuff, so it should be ok)
  35. YV12/I420 -> YV12/I420
  36. YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  37. BGR24 -> BGR32 & RGB24 -> RGB32
  38. BGR32 -> BGR24 & RGB32 -> RGB24
  39. BGR24 -> YV12
  40. */
  41. #include <inttypes.h>
  42. #include <string.h>
  43. #include <math.h>
  44. #include <stdio.h>
  45. #include "../config.h"
  46. #include "../mangle.h"
  47. #include <assert.h>
  48. #ifdef HAVE_MALLOC_H
  49. #include <malloc.h>
  50. #else
  51. #include <stdlib.h>
  52. #endif
  53. #include "swscale.h"
  54. #include "../cpudetect.h"
  55. #include "../bswap.h"
  56. #include "../libvo/img_format.h"
  57. #include "rgb2rgb.h"
  58. #include "../libvo/fastmemcpy.h"
  59. #include "../mp_msg.h"
  60. #define MSG_WARN(args...) mp_msg(MSGT_SWS,MSGL_WARN, ##args )
  61. #define MSG_FATAL(args...) mp_msg(MSGT_SWS,MSGL_FATAL, ##args )
  62. #define MSG_ERR(args...) mp_msg(MSGT_SWS,MSGL_ERR, ##args )
  63. #define MSG_V(args...) mp_msg(MSGT_SWS,MSGL_V, ##args )
  64. #define MSG_DBG2(args...) mp_msg(MSGT_SWS,MSGL_DBG2, ##args )
  65. #define MSG_INFO(args...) mp_msg(MSGT_SWS,MSGL_INFO, ##args )
  66. #undef MOVNTQ
  67. #undef PAVGB
  68. //#undef HAVE_MMX2
  69. //#define HAVE_3DNOW
  70. //#undef HAVE_MMX
  71. //#undef ARCH_X86
  72. //#define WORDS_BIGENDIAN
  73. #define DITHER1XBPP
  74. #define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
  75. #define RET 0xC3 //near return opcode for X86
  76. #ifdef MP_DEBUG
  77. #define ASSERT(x) assert(x);
  78. #else
  79. #define ASSERT(x) ;
  80. #endif
  81. #ifdef M_PI
  82. #define PI M_PI
  83. #else
  84. #define PI 3.14159265358979323846
  85. #endif
  86. //FIXME replace this with something faster
  87. #define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YVU9)
  88. #define isYUV(x) ((x)==IMGFMT_YUY2 || isPlanarYUV(x))
  89. #define isGray(x) ((x)==IMGFMT_Y800)
  90. #define isSupportedIn(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2 \
  91. || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
  92. || (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
  93. || (x)==IMGFMT_Y800 || (x)==IMGFMT_YVU9)
  94. #define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 \
  95. || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
  96. || (x)==IMGFMT_Y800 || (x)==IMGFMT_YVU9)
  97. #define isRGB(x) (((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
  98. #define isBGR(x) (((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR)
  99. #define isPacked(x) ((x)==IMGFMT_YUY2 || isRGB(x) || isBGR(x))
  100. #define RGB2YUV_SHIFT 16
  101. #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
  102. #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
  103. #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
  104. #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
  105. #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
  106. #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
  107. #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
  108. #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
  109. #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
  110. extern int verbose; // defined in mplayer.c
  111. /*
  112. NOTES
  113. Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  114. TODO
  115. more intelligent missalignment avoidance for the horizontal scaler
  116. write special vertical cubic upscale version
  117. Optimize C code (yv12 / minmax)
  118. add support for packed pixel yuv input & output
  119. add support for Y8 output
  120. optimize bgr24 & bgr32
  121. add BGR4 output support
  122. write special BGR->BGR scaler
  123. deglobalize yuv2rgb*.c
  124. */
  125. #define ABS(a) ((a) > 0 ? (a) : (-(a)))
  126. #define MIN(a,b) ((a) > (b) ? (b) : (a))
  127. #define MAX(a,b) ((a) < (b) ? (b) : (a))
  128. #ifdef ARCH_X86
  129. #define CAN_COMPILE_X86_ASM
  130. #endif
  131. #ifdef CAN_COMPILE_X86_ASM
  132. static uint64_t __attribute__((aligned(8))) yCoeff= 0x2568256825682568LL;
  133. static uint64_t __attribute__((aligned(8))) vrCoeff= 0x3343334333433343LL;
  134. static uint64_t __attribute__((aligned(8))) ubCoeff= 0x40cf40cf40cf40cfLL;
  135. static uint64_t __attribute__((aligned(8))) vgCoeff= 0xE5E2E5E2E5E2E5E2LL;
  136. static uint64_t __attribute__((aligned(8))) ugCoeff= 0xF36EF36EF36EF36ELL;
  137. static uint64_t __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL;
  138. static uint64_t __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL;
  139. static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL;
  140. static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL;
  141. static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL;
  142. static uint64_t __attribute__((aligned(8))) w02= 0x0002000200020002LL;
  143. static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
  144. static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
  145. static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
  146. static uint64_t __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;
  147. static volatile uint64_t __attribute__((aligned(8))) b5Dither;
  148. static volatile uint64_t __attribute__((aligned(8))) g5Dither;
  149. static volatile uint64_t __attribute__((aligned(8))) g6Dither;
  150. static volatile uint64_t __attribute__((aligned(8))) r5Dither;
  151. static uint64_t __attribute__((aligned(8))) dither4[2]={
  152. 0x0103010301030103LL,
  153. 0x0200020002000200LL,};
  154. static uint64_t __attribute__((aligned(8))) dither8[2]={
  155. 0x0602060206020602LL,
  156. 0x0004000400040004LL,};
  157. static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL;
  158. static uint64_t __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL;
  159. static uint64_t __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL;
  160. static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL;
  161. static uint64_t __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL;
  162. static uint64_t __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL;
  163. static uint64_t __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFLL;
  164. static uint64_t __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00LL;
  165. static uint64_t __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000LL;
  166. #ifdef FAST_BGR2YV12
  167. static const uint64_t bgr2YCoeff __attribute__((aligned(8))) = 0x000000210041000DULL;
  168. static const uint64_t bgr2UCoeff __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
  169. static const uint64_t bgr2VCoeff __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
  170. #else
  171. static const uint64_t bgr2YCoeff __attribute__((aligned(8))) = 0x000020E540830C8BULL;
  172. static const uint64_t bgr2UCoeff __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
  173. static const uint64_t bgr2VCoeff __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
  174. #endif
  175. static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL;
  176. static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL;
  177. static const uint64_t w1111 __attribute__((aligned(8))) = 0x0001000100010001ULL;
  178. // FIXME remove
  179. static uint64_t __attribute__((aligned(8))) asm_yalpha1;
  180. static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
  181. #endif
  182. // clipping helper table for C implementations:
  183. static unsigned char clip_table[768];
  184. static unsigned short clip_table16b[768];
  185. static unsigned short clip_table16g[768];
  186. static unsigned short clip_table16r[768];
  187. static unsigned short clip_table15b[768];
  188. static unsigned short clip_table15g[768];
  189. static unsigned short clip_table15r[768];
  190. // yuv->rgb conversion tables:
  191. static int yuvtab_2568[256];
  192. static int yuvtab_3343[256];
  193. static int yuvtab_0c92[256];
  194. static int yuvtab_1a1e[256];
  195. static int yuvtab_40cf[256];
  196. // Needed for cubic scaler to catch overflows
  197. static int clip_yuvtab_2568[768];
  198. static int clip_yuvtab_3343[768];
  199. static int clip_yuvtab_0c92[768];
  200. static int clip_yuvtab_1a1e[768];
  201. static int clip_yuvtab_40cf[768];
  202. //global sws_flags from the command line
  203. int sws_flags=2;
  204. //global srcFilter
  205. SwsFilter src_filter= {NULL, NULL, NULL, NULL};
  206. float sws_lum_gblur= 0.0;
  207. float sws_chr_gblur= 0.0;
  208. int sws_chr_vshift= 0;
  209. int sws_chr_hshift= 0;
  210. float sws_chr_sharpen= 0.0;
  211. float sws_lum_sharpen= 0.0;
  212. /* cpuCaps combined from cpudetect and whats actually compiled in
  213. (if there is no support for something compiled in it wont appear here) */
  214. static CpuCaps cpuCaps;
  215. void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
  216. int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
  217. static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
  218. #ifdef CAN_COMPILE_X86_ASM
  219. void in_asm_used_var_warning_killer()
  220. {
  221. volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
  222. bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+asm_yalpha1+ asm_uvalpha1+
  223. M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
  224. if(i) i=0;
  225. }
  226. #endif
/* Formats exercised by selfTest(); used as both source and destination
   (destinations additionally filtered by isSupportedOut()).
   The list is 0-terminated; commented-out entries are aliases of ones
   already present. */
static int testFormat[]={
IMGFMT_YVU9,
IMGFMT_YV12,
//IMGFMT_IYUV,
IMGFMT_I420,
IMGFMT_BGR15,
IMGFMT_BGR16,
IMGFMT_BGR24,
IMGFMT_BGR32,
//IMGFMT_Y8,
IMGFMT_Y800,
//IMGFMT_YUY2,
0
};
  241. static uint64_t getSSD(uint8_t *src1, uint8_t *src2, int stride1, int stride2, int w, int h){
  242. int x,y;
  243. uint64_t ssd=0;
  244. for(y=0; y<h; y++){
  245. for(x=0; x<w; x++){
  246. int d= src1[x + y*stride1] - src2[x + y*stride2];
  247. ssd+= d*d;
  248. }
  249. }
  250. return ssd;
  251. }
  252. // test by ref -> src -> dst -> out & compare out against ref
  253. // ref & out are YV12
  254. static void doTest(uint8_t *ref[3], int refStride[3], int w, int h, int srcFormat, int dstFormat,
  255. int srcW, int srcH, int dstW, int dstH, int flags){
  256. uint8_t *src[3];
  257. uint8_t *dst[3];
  258. uint8_t *out[3];
  259. int srcStride[3], dstStride[3];
  260. int i;
  261. uint64_t ssdY, ssdU, ssdV;
  262. SwsContext *srcContext, *dstContext, *outContext;
  263. for(i=0; i<3; i++){
  264. srcStride[i]= srcW*4;
  265. dstStride[i]= dstW*4;
  266. src[i]= malloc(srcStride[i]*srcH);
  267. dst[i]= malloc(dstStride[i]*dstH);
  268. out[i]= malloc(refStride[i]*h);
  269. }
  270. srcContext= getSwsContext(w, h, IMGFMT_YV12, srcW, srcH, srcFormat, flags, NULL, NULL);
  271. dstContext= getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, NULL, NULL);
  272. outContext= getSwsContext(dstW, dstH, dstFormat, w, h, IMGFMT_YV12, flags, NULL, NULL);
  273. if(srcContext==NULL ||dstContext==NULL ||outContext==NULL){
  274. printf("Failed allocating swsContext\n");
  275. goto end;
  276. }
  277. // printf("test %X %X %X -> %X %X %X\n", (int)ref[0], (int)ref[1], (int)ref[2],
  278. // (int)src[0], (int)src[1], (int)src[2]);
  279. srcContext->swScale(srcContext, ref, refStride, 0, h , src, srcStride);
  280. dstContext->swScale(dstContext, src, srcStride, 0, srcH, dst, dstStride);
  281. outContext->swScale(outContext, dst, dstStride, 0, dstH, out, refStride);
  282. ssdY= getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
  283. ssdU= getSSD(ref[1], out[1], refStride[1], refStride[1], (w+1)>>1, (h+1)>>1);
  284. ssdV= getSSD(ref[2], out[2], refStride[2], refStride[2], (w+1)>>1, (h+1)>>1);
  285. if(isGray(srcFormat) || isGray(dstFormat)) ssdU=ssdV=0; //FIXME check that output is really gray
  286. ssdY/= w*h;
  287. ssdU/= w*h/4;
  288. ssdV/= w*h/4;
  289. if(ssdY>100 || ssdU>50 || ssdV>50){
  290. printf(" %s %dx%d -> %s %4dx%4d flags=%2d SSD=%5lld,%5lld,%5lld\n",
  291. vo_format_name(srcFormat), srcW, srcH,
  292. vo_format_name(dstFormat), dstW, dstH,
  293. flags,
  294. ssdY, ssdU, ssdV);
  295. }
  296. end:
  297. freeSwsContext(srcContext);
  298. freeSwsContext(dstContext);
  299. freeSwsContext(outContext);
  300. for(i=0; i<3; i++){
  301. free(src[i]);
  302. free(dst[i]);
  303. free(out[i]);
  304. }
  305. }
  306. static void selfTest(uint8_t *src[3], int stride[3], int w, int h){
  307. int srcFormat, dstFormat, srcFormatIndex, dstFormatIndex;
  308. int srcW, srcH, dstW, dstH;
  309. int flags;
  310. for(srcFormatIndex=0; ;srcFormatIndex++){
  311. srcFormat= testFormat[srcFormatIndex];
  312. if(!srcFormat) break;
  313. for(dstFormatIndex=0; ;dstFormatIndex++){
  314. dstFormat= testFormat[dstFormatIndex];
  315. if(!dstFormat) break;
  316. if(!isSupportedOut(dstFormat)) continue;
  317. srcW= w+w/3;
  318. srcH= h+h/3;
  319. for(dstW=w; dstW<w*2; dstW+= dstW/3){
  320. for(dstH=h; dstH<h*2; dstH+= dstH/3){
  321. for(flags=1; flags<33; flags*=2)
  322. doTest(src, stride, w, h, srcFormat, dstFormat,
  323. srcW, srcH, dstW, dstH, flags);
  324. }
  325. }
  326. }
  327. }
  328. }
  329. static inline void yuv2yuvXinC(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
  330. int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
  331. uint8_t *dest, uint8_t *uDest, uint8_t *vDest)
  332. {
  333. //FIXME Optimize (just quickly writen not opti..)
  334. int i;
  335. for(i=0; i<c->dstW; i++)
  336. {
  337. int val=0;
  338. int j;
  339. for(j=0; j<lumFilterSize; j++)
  340. val += lumSrc[j][i] * lumFilter[j];
  341. dest[i]= MIN(MAX(val>>19, 0), 255);
  342. }
  343. if(uDest != NULL)
  344. for(i=0; i<c->chrDstW; i++)
  345. {
  346. int u=0;
  347. int v=0;
  348. int j;
  349. for(j=0; j<chrFilterSize; j++)
  350. {
  351. u += chrSrc[j][i] * chrFilter[j];
  352. v += chrSrc[j][i + 2048] * chrFilter[j];
  353. }
  354. uDest[i]= MIN(MAX(u>>19, 0), 255);
  355. vDest[i]= MIN(MAX(v>>19, 0), 255);
  356. }
  357. }
/* Vertically filter the luma/chroma source lines and convert the result
   directly into a packed BGR destination line (plain C reference version).

   lumFilter/lumSrc, lumFilterSize   vertical luma filter taps and lines
   chrFilter/chrSrc, chrFilterSize   vertical chroma taps; the V samples of
                                     each line live at offset +2048 from U
   dest       output scanline, layout depends on dstFormat
   dstW       width in pixels; processed two pixels per iteration (they share
              one chroma sample), so an odd trailing pixel is not written
   dstFormat  IMGFMT_BGR32/BGR24/BGR16/BGR15 — any other value writes nothing */
static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
		int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
		uint8_t *dest, int dstW, int dstFormat)
{
	if(dstFormat==IMGFMT_BGR32)
	{
		int i;
#ifdef WORDS_BIGENDIAN
		dest++;	/* skip the leading alpha byte on big-endian layouts */
#endif
		for(i=0; i<(dstW>>1); i++){
			int j;
			int Y1=0;
			int Y2=0;
			int U=0;
			int V=0;
			int Cb, Cr, Cg;
			/* vertical FIR; >>19 below brings the sum back to 8-bit range */
			for(j=0; j<lumFilterSize; j++)
			{
				Y1 += lumSrc[j][2*i] * lumFilter[j];
				Y2 += lumSrc[j][2*i+1] * lumFilter[j];
			}
			for(j=0; j<chrFilterSize; j++)
			{
				U += chrSrc[j][i] * chrFilter[j];
				V += chrSrc[j][i+2048] * chrFilter[j];
			}
			/* +256 biases into the clip_yuvtab_* tables (see their init) */
			Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
			Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
			U >>= 19;
			V >>= 19;
			Cb= clip_yuvtab_40cf[U+ 256];
			Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
			Cr= clip_yuvtab_3343[V+ 256];
			/* 4 bytes/pixel, B G R order; bytes 3 and 7 (alpha) untouched */
			dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
			dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
			dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
			dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
			dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
			dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
		}
	}
	else if(dstFormat==IMGFMT_BGR24)
	{
		int i;
		for(i=0; i<(dstW>>1); i++){
			int j;
			int Y1=0;
			int Y2=0;
			int U=0;
			int V=0;
			int Cb, Cr, Cg;
			for(j=0; j<lumFilterSize; j++)
			{
				Y1 += lumSrc[j][2*i] * lumFilter[j];
				Y2 += lumSrc[j][2*i+1] * lumFilter[j];
			}
			for(j=0; j<chrFilterSize; j++)
			{
				U += chrSrc[j][i] * chrFilter[j];
				V += chrSrc[j][i+2048] * chrFilter[j];
			}
			Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
			Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
			U >>= 19;
			V >>= 19;
			Cb= clip_yuvtab_40cf[U+ 256];
			Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
			Cr= clip_yuvtab_3343[V+ 256];
			/* 3 bytes/pixel, B G R order */
			dest[0]=clip_table[((Y1 + Cb) >>13)];
			dest[1]=clip_table[((Y1 + Cg) >>13)];
			dest[2]=clip_table[((Y1 + Cr) >>13)];
			dest[3]=clip_table[((Y2 + Cb) >>13)];
			dest[4]=clip_table[((Y2 + Cg) >>13)];
			dest[5]=clip_table[((Y2 + Cr) >>13)];
			dest+=6;
		}
	}
	else if(dstFormat==IMGFMT_BGR16)
	{
		int i;
#ifdef DITHER1XBPP
		/* Ordered-dither offsets; the static values are XORed on every
		   call, so the pattern alternates between two phases from one
		   scanline to the next. */
		static int ditherb1=1<<14;
		static int ditherg1=1<<13;
		static int ditherr1=2<<14;
		static int ditherb2=3<<14;
		static int ditherg2=3<<13;
		static int ditherr2=0<<14;

		ditherb1 ^= (1^2)<<14;
		ditherg1 ^= (1^2)<<13;
		ditherr1 ^= (1^2)<<14;
		ditherb2 ^= (3^0)<<14;
		ditherg2 ^= (3^0)<<13;
		ditherr2 ^= (3^0)<<14;
#else
		const int ditherb1=0;
		const int ditherg1=0;
		const int ditherr1=0;
		const int ditherb2=0;
		const int ditherg2=0;
		const int ditherr2=0;
#endif
		for(i=0; i<(dstW>>1); i++){
			int j;
			int Y1=0;
			int Y2=0;
			int U=0;
			int V=0;
			int Cb, Cr, Cg;
			for(j=0; j<lumFilterSize; j++)
			{
				Y1 += lumSrc[j][2*i] * lumFilter[j];
				Y2 += lumSrc[j][2*i+1] * lumFilter[j];
			}
			for(j=0; j<chrFilterSize; j++)
			{
				U += chrSrc[j][i] * chrFilter[j];
				V += chrSrc[j][i+2048] * chrFilter[j];
			}
			Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
			Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
			U >>= 19;
			V >>= 19;
			Cb= clip_yuvtab_40cf[U+ 256];
			Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
			Cr= clip_yuvtab_3343[V+ 256];
			/* the clip_table16* tables return pre-shifted 5/6/5 fields,
			   so the components can simply be ORed together */
			((uint16_t*)dest)[2*i] =
				clip_table16b[(Y1 + Cb + ditherb1) >>13] |
				clip_table16g[(Y1 + Cg + ditherg1) >>13] |
				clip_table16r[(Y1 + Cr + ditherr1) >>13];
			((uint16_t*)dest)[2*i+1] =
				clip_table16b[(Y2 + Cb + ditherb2) >>13] |
				clip_table16g[(Y2 + Cg + ditherg2) >>13] |
				clip_table16r[(Y2 + Cr + ditherr2) >>13];
		}
	}
	else if(dstFormat==IMGFMT_BGR15)
	{
		int i;
#ifdef DITHER1XBPP
		/* same alternating dither as BGR16, but with 5-bit green offsets */
		static int ditherb1=1<<14;
		static int ditherg1=1<<14;
		static int ditherr1=2<<14;
		static int ditherb2=3<<14;
		static int ditherg2=3<<14;
		static int ditherr2=0<<14;

		ditherb1 ^= (1^2)<<14;
		ditherg1 ^= (1^2)<<14;
		ditherr1 ^= (1^2)<<14;
		ditherb2 ^= (3^0)<<14;
		ditherg2 ^= (3^0)<<14;
		ditherr2 ^= (3^0)<<14;
#else
		const int ditherb1=0;
		const int ditherg1=0;
		const int ditherr1=0;
		const int ditherb2=0;
		const int ditherg2=0;
		const int ditherr2=0;
#endif
		for(i=0; i<(dstW>>1); i++){
			int j;
			int Y1=0;
			int Y2=0;
			int U=0;
			int V=0;
			int Cb, Cr, Cg;
			for(j=0; j<lumFilterSize; j++)
			{
				Y1 += lumSrc[j][2*i] * lumFilter[j];
				Y2 += lumSrc[j][2*i+1] * lumFilter[j];
			}
			for(j=0; j<chrFilterSize; j++)
			{
				U += chrSrc[j][i] * chrFilter[j];
				V += chrSrc[j][i+2048] * chrFilter[j];
			}
			Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
			Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
			U >>= 19;
			V >>= 19;
			Cb= clip_yuvtab_40cf[U+ 256];
			Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
			Cr= clip_yuvtab_3343[V+ 256];
			/* 5/5/5 fields, ORed together as above */
			((uint16_t*)dest)[2*i] =
				clip_table15b[(Y1 + Cb + ditherb1) >>13] |
				clip_table15g[(Y1 + Cg + ditherg1) >>13] |
				clip_table15r[(Y1 + Cr + ditherr1) >>13];
			((uint16_t*)dest)[2*i+1] =
				clip_table15b[(Y2 + Cb + ditherb2) >>13] |
				clip_table15g[(Y2 + Cg + ditherg2) >>13] |
				clip_table15r[(Y2 + Cr + ditherr2) >>13];
		}
	}
}
553. //Note: we have C, X86, MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one
  554. //Plain C versions
  555. #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
  556. #define COMPILE_C
  557. #endif
  558. #ifdef CAN_COMPILE_X86_ASM
  559. #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
  560. #define COMPILE_MMX
  561. #endif
  562. #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
  563. #define COMPILE_MMX2
  564. #endif
  565. #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
  566. #define COMPILE_3DNOW
  567. #endif
  568. #endif //CAN_COMPILE_X86_ASM
  569. #undef HAVE_MMX
  570. #undef HAVE_MMX2
  571. #undef HAVE_3DNOW
  572. #ifdef COMPILE_C
  573. #undef HAVE_MMX
  574. #undef HAVE_MMX2
  575. #undef HAVE_3DNOW
  576. #define RENAME(a) a ## _C
  577. #include "swscale_template.c"
  578. #endif
  579. #ifdef CAN_COMPILE_X86_ASM
  580. //X86 versions
  581. /*
  582. #undef RENAME
  583. #undef HAVE_MMX
  584. #undef HAVE_MMX2
  585. #undef HAVE_3DNOW
  586. #define ARCH_X86
  587. #define RENAME(a) a ## _X86
  588. #include "swscale_template.c"
  589. */
  590. //MMX versions
  591. #ifdef COMPILE_MMX
  592. #undef RENAME
  593. #define HAVE_MMX
  594. #undef HAVE_MMX2
  595. #undef HAVE_3DNOW
  596. #define RENAME(a) a ## _MMX
  597. #include "swscale_template.c"
  598. #endif
  599. //MMX2 versions
  600. #ifdef COMPILE_MMX2
  601. #undef RENAME
  602. #define HAVE_MMX
  603. #define HAVE_MMX2
  604. #undef HAVE_3DNOW
  605. #define RENAME(a) a ## _MMX2
  606. #include "swscale_template.c"
  607. #endif
  608. //3DNOW versions
  609. #ifdef COMPILE_3DNOW
  610. #undef RENAME
  611. #define HAVE_MMX
  612. #undef HAVE_MMX2
  613. #define HAVE_3DNOW
  614. #define RENAME(a) a ## _3DNow
  615. #include "swscale_template.c"
  616. #endif
  617. #endif //CAN_COMPILE_X86_ASM
  618. // minor note: the HAVE_xyz is messed up after that line so dont use it
  619. // old global scaler, dont use for new code
  620. // will use sws_flags from the command line
  621. void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
  622. int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
  623. int srcW, int srcH, int dstW, int dstH){
  624. static SwsContext *context=NULL;
  625. int dstFormat;
  626. int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
  627. switch(dstbpp)
  628. {
  629. case 8 : dstFormat= IMGFMT_Y8; break;
  630. case 12: dstFormat= IMGFMT_YV12; break;
  631. case 15: dstFormat= IMGFMT_BGR15; break;
  632. case 16: dstFormat= IMGFMT_BGR16; break;
  633. case 24: dstFormat= IMGFMT_BGR24; break;
  634. case 32: dstFormat= IMGFMT_BGR32; break;
  635. default: return;
  636. }
  637. if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
  638. context->swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
  639. }
  640. // will use sws_flags & src_filter (from cmd line)
  641. SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
  642. {
  643. int flags=0;
  644. static int firstTime=1;
  645. #ifdef ARCH_X86
  646. if(gCpuCaps.hasMMX)
  647. asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
  648. #endif
  649. if(firstTime)
  650. {
  651. firstTime=0;
  652. flags= SWS_PRINT_INFO;
  653. }
  654. else if(verbose>1) flags= SWS_PRINT_INFO;
  655. if(src_filter.lumH) freeVec(src_filter.lumH);
  656. if(src_filter.lumV) freeVec(src_filter.lumV);
  657. if(src_filter.chrH) freeVec(src_filter.chrH);
  658. if(src_filter.chrV) freeVec(src_filter.chrV);
  659. if(sws_lum_gblur!=0.0){
  660. src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
  661. src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
  662. }else{
  663. src_filter.lumH= getIdentityVec();
  664. src_filter.lumV= getIdentityVec();
  665. }
  666. if(sws_chr_gblur!=0.0){
  667. src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
  668. src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
  669. }else{
  670. src_filter.chrH= getIdentityVec();
  671. src_filter.chrV= getIdentityVec();
  672. }
  673. if(sws_chr_sharpen!=0.0){
  674. SwsVector *g= getConstVec(-1.0, 3);
  675. SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
  676. g->coeff[1]=2.0;
  677. addVec(id, g);
  678. convVec(src_filter.chrH, id);
  679. convVec(src_filter.chrV, id);
  680. freeVec(g);
  681. freeVec(id);
  682. }
  683. if(sws_lum_sharpen!=0.0){
  684. SwsVector *g= getConstVec(-1.0, 3);
  685. SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
  686. g->coeff[1]=2.0;
  687. addVec(id, g);
  688. convVec(src_filter.lumH, id);
  689. convVec(src_filter.lumV, id);
  690. freeVec(g);
  691. freeVec(id);
  692. }
  693. if(sws_chr_hshift)
  694. shiftVec(src_filter.chrH, sws_chr_hshift);
  695. if(sws_chr_vshift)
  696. shiftVec(src_filter.chrV, sws_chr_vshift);
  697. normalizeVec(src_filter.chrH, 1.0);
  698. normalizeVec(src_filter.chrV, 1.0);
  699. normalizeVec(src_filter.lumH, 1.0);
  700. normalizeVec(src_filter.lumV, 1.0);
  701. if(verbose > 1) printVec(src_filter.chrH);
  702. if(verbose > 1) printVec(src_filter.lumH);
  703. switch(sws_flags)
  704. {
  705. case 0: flags|= SWS_FAST_BILINEAR; break;
  706. case 1: flags|= SWS_BILINEAR; break;
  707. case 2: flags|= SWS_BICUBIC; break;
  708. case 3: flags|= SWS_X; break;
  709. case 4: flags|= SWS_POINT; break;
  710. case 5: flags|= SWS_AREA; break;
  711. default:flags|= SWS_BILINEAR; break;
  712. }
  713. return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, &src_filter, NULL);
  714. }
  715. static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
  716. int srcW, int dstW, int filterAlign, int one, int flags,
  717. SwsVector *srcFilter, SwsVector *dstFilter)
  718. {
  719. int i;
  720. int filterSize;
  721. int filter2Size;
  722. int minFilterSize;
  723. double *filter=NULL;
  724. double *filter2=NULL;
  725. #ifdef ARCH_X86
  726. if(gCpuCaps.hasMMX)
  727. asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
  728. #endif
  729. // Note the +1 is for the MMXscaler which reads over the end
  730. *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
  731. if(ABS(xInc - 0x10000) <10) // unscaled
  732. {
  733. int i;
  734. filterSize= 1;
  735. filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
  736. for(i=0; i<dstW*filterSize; i++) filter[i]=0;
  737. for(i=0; i<dstW; i++)
  738. {
  739. filter[i*filterSize]=1;
  740. (*filterPos)[i]=i;
  741. }
  742. }
  743. else if(flags&SWS_POINT) // lame looking point sampling mode
  744. {
  745. int i;
  746. int xDstInSrc;
  747. filterSize= 1;
  748. filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
  749. xDstInSrc= xInc/2 - 0x8000;
  750. for(i=0; i<dstW; i++)
  751. {
  752. int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
  753. (*filterPos)[i]= xx;
  754. filter[i]= 1.0;
  755. xDstInSrc+= xInc;
  756. }
  757. }
  758. else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
  759. {
  760. int i;
  761. int xDstInSrc;
  762. if (flags&SWS_BICUBIC) filterSize= 4;
  763. else if(flags&SWS_X ) filterSize= 4;
  764. else filterSize= 2; // SWS_BILINEAR / SWS_AREA
  765. filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
  766. xDstInSrc= xInc/2 - 0x8000;
  767. for(i=0; i<dstW; i++)
  768. {
  769. int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
  770. int j;
  771. (*filterPos)[i]= xx;
  772. if((flags & SWS_BICUBIC) || (flags & SWS_X))
  773. {
  774. double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
  775. double y1,y2,y3,y4;
  776. double A= -0.6;
  777. if(flags & SWS_BICUBIC){
  778. // Equation is from VirtualDub
  779. y1 = ( + A*d - 2.0*A*d*d + A*d*d*d);
  780. y2 = (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
  781. y3 = ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
  782. y4 = ( + A*d*d - A*d*d*d);
  783. }else{
  784. // cubic interpolation (derived it myself)
  785. y1 = ( -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
  786. y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
  787. y3 = ( +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
  788. y4 = ( -1.0*d + 1.0*d*d*d)/6.0;
  789. }
  790. filter[i*filterSize + 0]= y1;
  791. filter[i*filterSize + 1]= y2;
  792. filter[i*filterSize + 2]= y3;
  793. filter[i*filterSize + 3]= y4;
  794. }
  795. else
  796. {
  797. //Bilinear upscale / linear interpolate / Area averaging
  798. for(j=0; j<filterSize; j++)
  799. {
  800. double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
  801. double coeff= 1.0 - d;
  802. if(coeff<0) coeff=0;
  803. filter[i*filterSize + j]= coeff;
  804. xx++;
  805. }
  806. }
  807. xDstInSrc+= xInc;
  808. }
  809. }
  810. else // downscale
  811. {
  812. int xDstInSrc;
  813. ASSERT(dstW <= srcW)
  814. if(flags&SWS_BICUBIC) filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
  815. else if(flags&SWS_X) filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
  816. else if(flags&SWS_AREA) filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
  817. else /* BILINEAR */ filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
  818. filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
  819. xDstInSrc= xInc/2 - 0x8000;
  820. for(i=0; i<dstW; i++)
  821. {
  822. int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
  823. int j;
  824. (*filterPos)[i]= xx;
  825. for(j=0; j<filterSize; j++)
  826. {
  827. double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
  828. double coeff;
  829. if((flags & SWS_BICUBIC) || (flags & SWS_X))
  830. {
  831. double A= -0.75;
  832. // d*=2;
  833. // Equation is from VirtualDub
  834. if(d<1.0)
  835. coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
  836. else if(d<2.0)
  837. coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
  838. else
  839. coeff=0.0;
  840. }
  841. else if(flags & SWS_AREA)
  842. {
  843. double srcPixelSize= (1<<16)/(double)xInc;
  844. if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
  845. else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
  846. else coeff=0.0;
  847. }
  848. else
  849. {
  850. coeff= 1.0 - d;
  851. if(coeff<0) coeff=0;
  852. }
  853. filter[i*filterSize + j]= coeff;
  854. xx++;
  855. }
  856. xDstInSrc+= xInc;
  857. }
  858. }
  859. /* apply src & dst Filter to filter -> filter2
  860. free(filter);
  861. */
  862. ASSERT(filterSize>0)
  863. filter2Size= filterSize;
  864. if(srcFilter) filter2Size+= srcFilter->length - 1;
  865. if(dstFilter) filter2Size+= dstFilter->length - 1;
  866. ASSERT(filter2Size>0)
  867. filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
  868. for(i=0; i<dstW; i++)
  869. {
  870. int j;
  871. SwsVector scaleFilter;
  872. SwsVector *outVec;
  873. scaleFilter.coeff= filter + i*filterSize;
  874. scaleFilter.length= filterSize;
  875. if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
  876. else outVec= &scaleFilter;
  877. ASSERT(outVec->length == filter2Size)
  878. //FIXME dstFilter
  879. for(j=0; j<outVec->length; j++)
  880. {
  881. filter2[i*filter2Size + j]= outVec->coeff[j];
  882. }
  883. (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
  884. if(outVec != &scaleFilter) freeVec(outVec);
  885. }
  886. free(filter); filter=NULL;
  887. /* try to reduce the filter-size (step1 find size and shift left) */
  888. // Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
  889. minFilterSize= 0;
  890. for(i=dstW-1; i>=0; i--)
  891. {
  892. int min= filter2Size;
  893. int j;
  894. double cutOff=0.0;
  895. /* get rid off near zero elements on the left by shifting left */
  896. for(j=0; j<filter2Size; j++)
  897. {
  898. int k;
  899. cutOff += ABS(filter2[i*filter2Size]);
  900. if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
  901. /* preserve Monotonicity because the core cant handle the filter otherwise */
  902. if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
  903. // Move filter coeffs left
  904. for(k=1; k<filter2Size; k++)
  905. filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
  906. filter2[i*filter2Size + k - 1]= 0.0;
  907. (*filterPos)[i]++;
  908. }
  909. cutOff=0.0;
  910. /* count near zeros on the right */
  911. for(j=filter2Size-1; j>0; j--)
  912. {
  913. cutOff += ABS(filter2[i*filter2Size + j]);
  914. if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
  915. min--;
  916. }
  917. if(min>minFilterSize) minFilterSize= min;
  918. }
  919. ASSERT(minFilterSize > 0)
  920. filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
  921. ASSERT(filterSize > 0)
  922. filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
  923. *outFilterSize= filterSize;
  924. if(flags&SWS_PRINT_INFO)
  925. MSG_INFO("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
  926. /* try to reduce the filter-size (step2 reduce it) */
  927. for(i=0; i<dstW; i++)
  928. {
  929. int j;
  930. for(j=0; j<filterSize; j++)
  931. {
  932. if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
  933. else filter[i*filterSize + j]= filter2[i*filter2Size + j];
  934. }
  935. }
  936. free(filter2); filter2=NULL;
  937. //FIXME try to align filterpos if possible
  938. //fix borders
  939. for(i=0; i<dstW; i++)
  940. {
  941. int j;
  942. if((*filterPos)[i] < 0)
  943. {
  944. // Move filter coeffs left to compensate for filterPos
  945. for(j=1; j<filterSize; j++)
  946. {
  947. int left= MAX(j + (*filterPos)[i], 0);
  948. filter[i*filterSize + left] += filter[i*filterSize + j];
  949. filter[i*filterSize + j]=0;
  950. }
  951. (*filterPos)[i]= 0;
  952. }
  953. if((*filterPos)[i] + filterSize > srcW)
  954. {
  955. int shift= (*filterPos)[i] + filterSize - srcW;
  956. // Move filter coeffs right to compensate for filterPos
  957. for(j=filterSize-2; j>=0; j--)
  958. {
  959. int right= MIN(j + shift, filterSize-1);
  960. filter[i*filterSize +right] += filter[i*filterSize +j];
  961. filter[i*filterSize +j]=0;
  962. }
  963. (*filterPos)[i]= srcW - filterSize;
  964. }
  965. }
  966. // Note the +1 is for the MMXscaler which reads over the end
  967. *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
  968. memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
  969. /* Normalize & Store in outFilter */
  970. for(i=0; i<dstW; i++)
  971. {
  972. int j;
  973. double sum=0;
  974. double scale= one;
  975. for(j=0; j<filterSize; j++)
  976. {
  977. sum+= filter[i*filterSize + j];
  978. }
  979. scale/= sum;
  980. for(j=0; j<filterSize; j++)
  981. {
  982. (*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
  983. }
  984. }
  985. (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
  986. for(i=0; i<*outFilterSize; i++)
  987. {
  988. int j= dstW*(*outFilterSize);
  989. (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
  990. }
  991. free(filter);
  992. }
  993. #ifdef ARCH_X86
/*
 * Build a run-time generated ("funny") MMX2 horizontal scaling routine.
 *
 * Two asm code templates are emitted below and measured at run time via the
 * leal/label trick: fragment A loads two adjacent source dwords (general
 * case), fragment B loads a single dword (used when the 4-pixel group fits
 * in one dword, i.e. d+1 < 4).  For every group of 4 output pixels one
 * template is memcpy'd into funnyCode and its two pshufw immediates are
 * patched so the right source bytes are broadcast/selected.
 *
 * dstW      - number of output pixels (per split)
 * xInc      - 16.16 fixed point source step per output pixel
 * funnyCode - destination buffer for the generated code (must be executable)
 * filter    - receives 4 7-bit blend coefficients per group
 * filterPos - receives the source position of each group
 * numSplits - the output is generated in numSplits parts
 */
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
{
	uint8_t *fragmentA;   // start address of code template A
	int imm8OfPShufW1A;   // byte offset of 1st pshufw immediate inside fragment A
	int imm8OfPShufW2A;   // byte offset of 2nd pshufw immediate inside fragment A
	int fragmentLengthA;  // length of fragment A in bytes
	uint8_t *fragmentB;   // start address of code template B
	int imm8OfPShufW1B;
	int imm8OfPShufW2B;
	int fragmentLengthB;
	int fragmentPos;      // current write offset inside funnyCode
	int xpos, i;

	// create an optimized horizontal scaling routine

	// code fragment A: two-dword load variant.
	// The jmp skips the template at run time; the trailing leal block computes
	// the template's address, its length, and the offsets of the two pshufw
	// immediate bytes (labels 1 and 2 sit just after each immediate, hence the
	// decl to step back one byte).
	asm volatile(
		"jmp 9f \n\t"
	// Begin
		"0: \n\t"
		"movq (%%edx, %%eax), %%mm3 \n\t"
		"movd (%%ecx, %%esi), %%mm0 \n\t"
		"movd 1(%%ecx, %%esi), %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t"
		"pshufw $0xFF, %%mm1, %%mm1 \n\t"
		"1: \n\t"
		"pshufw $0xFF, %%mm0, %%mm0 \n\t"
		"2: \n\t"
		"psubw %%mm1, %%mm0 \n\t"
		"movl 8(%%ebx, %%eax), %%esi \n\t"
		"pmullw %%mm3, %%mm0 \n\t"
		"psllw $7, %%mm1 \n\t"
		"paddw %%mm1, %%mm0 \n\t"
		"movq %%mm0, (%%edi, %%eax) \n\t"
		"addl $8, %%eax \n\t"
	// End
		"9: \n\t"
//		"int $3\n\t"
		"leal 0b, %0 \n\t"
		"leal 1b, %1 \n\t"
		"leal 2b, %2 \n\t"
		"decl %1 \n\t"
		"decl %2 \n\t"
		"subl %0, %1 \n\t"
		"subl %0, %2 \n\t"
		"leal 9b, %3 \n\t"
		"subl %0, %3 \n\t"
		:"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
		"=r" (fragmentLengthA)
	);

	// code fragment B: single-dword load variant (both pshufw sources come
	// from the same dword, so only one movd is needed).
	asm volatile(
		"jmp 9f \n\t"
	// Begin
		"0: \n\t"
		"movq (%%edx, %%eax), %%mm3 \n\t"
		"movd (%%ecx, %%esi), %%mm0 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t"
		"pshufw $0xFF, %%mm0, %%mm1 \n\t"
		"1: \n\t"
		"pshufw $0xFF, %%mm0, %%mm0 \n\t"
		"2: \n\t"
		"psubw %%mm1, %%mm0 \n\t"
		"movl 8(%%ebx, %%eax), %%esi \n\t"
		"pmullw %%mm3, %%mm0 \n\t"
		"psllw $7, %%mm1 \n\t"
		"paddw %%mm1, %%mm0 \n\t"
		"movq %%mm0, (%%edi, %%eax) \n\t"
		"addl $8, %%eax \n\t"
	// End
		"9: \n\t"
//		"int $3\n\t"
		"leal 0b, %0 \n\t"
		"leal 1b, %1 \n\t"
		"leal 2b, %2 \n\t"
		"decl %1 \n\t"
		"decl %2 \n\t"
		"subl %0, %1 \n\t"
		"subl %0, %2 \n\t"
		"leal 9b, %3 \n\t"
		"subl %0, %3 \n\t"
		:"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
		"=r" (fragmentLengthB)
	);

	xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
	fragmentPos=0;

	for(i=0; i<dstW/numSplits; i++)
	{
		int xx=xpos>>16;            // integer source position of this pixel

		if((i&3) == 0)              // one code fragment handles 4 output pixels
		{
			int a=0;
			// b,c,d: source offsets (relative to xx) of pixels i+1..i+3
			int b=((xpos+xInc)>>16) - xx;
			int c=((xpos+xInc*2)>>16) - xx;
			int d=((xpos+xInc*3)>>16) - xx;

			// blend weight = inverted fractional part, reduced to 7 bits
			filter[i ] = (( xpos & 0xFFFF) ^ 0xFFFF)>>9;
			filter[i+1] = (((xpos+xInc ) & 0xFFFF) ^ 0xFFFF)>>9;
			filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
			filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
			// NOTE(review): filterPos is indexed with i/2 while fragments cover
			// groups of 4 - presumably matching the MMX2 scaler's consumption
			// pattern; confirm against the hScale MMX2 core before changing.
			filterPos[i/2]= xx;

			if(d+1<4)   // all 4 source pixels (incl. the +1 neighbours) fit in one dword
			{
				int maxShift= 3-(d+1);
				int shift=0;

				memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB);

				// patch the two pshufw immediates: byte selectors for the
				// "next" pixels (a+1..d+1) and the base pixels (a..d)
				funnyCode[fragmentPos + imm8OfPShufW1B]=
					(a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
				funnyCode[fragmentPos + imm8OfPShufW2B]=
					a | (b<<2) | (c<<4) | (d<<6);

				if(i+3>=dstW) shift=maxShift; //avoid overread
				else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align

				if(shift && i>=shift)
				{
					// 0x55*shift bumps every 2-bit selector field by `shift`,
					// compensating for the aligned (shifted-back) filterPos
					funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift;
					funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift;
					filterPos[i/2]-=shift;
				}

				fragmentPos+= fragmentLengthB;
			}
			else        // need the two-dword template
			{
				int maxShift= 3-d;
				int shift=0;

				memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);

				// fragment A broadcasts the +1 neighbours from the second movd,
				// so both immediates use the base selectors a..d
				funnyCode[fragmentPos + imm8OfPShufW1A]=
				funnyCode[fragmentPos + imm8OfPShufW2A]=
					a | (b<<2) | (c<<4) | (d<<6);

				if(i+4>=dstW) shift=maxShift; //avoid overread
				else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align

				if(shift && i>=shift)
				{
					funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift;
					funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift;
					filterPos[i/2]-=shift;
				}

				fragmentPos+= fragmentLengthA;
			}

			// terminate the generated code after the last emitted fragment;
			// overwritten by the next fragment's first byte on later groups
			funnyCode[fragmentPos]= RET;
		}
		xpos+=xInc;
	}
	filterPos[i/2]= xpos>>16; // needed to jump to the next part
}
  1135. #endif // ARCH_X86
//FIXME remove
/* Historical no-op kept only for link compatibility with callers that still
   invoke it; real initialization happens lazily in globalInit(). */
void SwScale_Init(){
}
/*
 * One-time global initialization: fills the YUV->RGB lookup/clip tables and
 * selects the swScale implementation matching the available CPU features.
 * Called lazily from getSwsContext() when swScale is still NULL.
 */
static void globalInit(){
	// generating tables:
	int i;
	// clip_table maps i in [0,768) to clamp(i-256, 0, 255); the clip_yuvtab_*
	// variants are indexed by the unclipped value, the yuvtab_* by the clipped one
	for(i=0; i<768; i++){
		int c= MIN(MAX(i-256, 0), 255);
		clip_table[i]=c;
		// fixed point YUV->RGB coefficients (BT.601-style), <<13 scaled;
		// 0x2568 ~ 1.164 luma scale, bias 256<<13 recenters the sum
		yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
		yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
		yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
		yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
		yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
	}

	// per-channel tables producing pre-shifted/masked RGB565 and RGB555 parts
	for(i=0; i<768; i++)
	{
		int v= clip_table[i];
		clip_table16b[i]= v>>3;
		clip_table16g[i]= (v<<3)&0x07E0;
		clip_table16r[i]= (v<<8)&0xF800;
		clip_table15b[i]= v>>3;
		clip_table15g[i]= (v<<2)&0x03E0;
		clip_table15r[i]= (v<<7)&0x7C00;
	}

	cpuCaps= gCpuCaps;

#ifdef RUNTIME_CPUDETECT
#ifdef CAN_COMPILE_X86_ASM
	// ordered by speed, fastest first
	if(gCpuCaps.hasMMX2)
		swScale= swScale_MMX2;
	else if(gCpuCaps.has3DNow)
		swScale= swScale_3DNow;
	else if(gCpuCaps.hasMMX)
		swScale= swScale_MMX;
	else
		swScale= swScale_C;

#else
	swScale= swScale_C;
	cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
#endif
#else //RUNTIME_CPUDETECT
	// compile-time selection: pick the single variant that was built in and
	// mask out the capability flags the chosen code path must not rely on
#ifdef HAVE_MMX2
	swScale= swScale_MMX2;
	cpuCaps.has3DNow = 0;
#elif defined (HAVE_3DNOW)
	swScale= swScale_3DNow;
	cpuCaps.hasMMX2 = 0;
#elif defined (HAVE_MMX)
	swScale= swScale_MMX;
	cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
#else
	swScale= swScale_C;
	cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
#endif
#endif //!RUNTIME_CPUDETECT
}
  1193. static void PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1194. int srcSliceH, uint8_t* dstParam[], int dstStride[]){
  1195. uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
  1196. /* Copy Y plane */
  1197. if(dstStride[0]==srcStride[0])
  1198. memcpy(dst, src[0], srcSliceH*dstStride[0]);
  1199. else
  1200. {
  1201. int i;
  1202. uint8_t *srcPtr= src[0];
  1203. uint8_t *dstPtr= dst;
  1204. for(i=0; i<srcSliceH; i++)
  1205. {
  1206. memcpy(dstPtr, srcPtr, srcStride[0]);
  1207. srcPtr+= srcStride[0];
  1208. dstPtr+= dstStride[0];
  1209. }
  1210. }
  1211. dst = dstParam[1] + dstStride[1]*srcSliceY;
  1212. if(c->srcFormat==IMGFMT_YV12)
  1213. interleaveBytes( src[1],src[2],dst,c->srcW,srcSliceH,srcStride[1],srcStride[2],dstStride[0] );
  1214. else /* I420 & IYUV */
  1215. interleaveBytes( src[2],src[1],dst,c->srcW,srcSliceH,srcStride[2],srcStride[1],dstStride[0] );
  1216. }
/* Wrapper functions for yuv2bgr */
  1218. static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1219. int srcSliceH, uint8_t* dstParam[], int dstStride[]){
  1220. uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
  1221. if(c->srcFormat==IMGFMT_YV12)
  1222. yuv2rgb( dst,src[0],src[1],src[2],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
  1223. else /* I420 & IYUV */
  1224. yuv2rgb( dst,src[0],src[2],src[1],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
  1225. }
  1226. static void PlanarToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1227. int srcSliceH, uint8_t* dstParam[], int dstStride[]){
  1228. uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
  1229. if(c->srcFormat==IMGFMT_YV12)
  1230. yv12toyuy2( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
  1231. else /* I420 & IYUV */
  1232. yv12toyuy2( src[0],src[2],src[1],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
  1233. }
  1234. static void bgr24to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1235. int srcSliceH, uint8_t* dst[], int dstStride[]){
  1236. if(dstStride[0]*3==srcStride[0]*4)
  1237. rgb24to32(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
  1238. else
  1239. {
  1240. int i;
  1241. uint8_t *srcPtr= src[0];
  1242. uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
  1243. for(i=0; i<srcSliceH; i++)
  1244. {
  1245. rgb24to32(srcPtr, dstPtr, c->srcW*3);
  1246. srcPtr+= srcStride[0];
  1247. dstPtr+= dstStride[0];
  1248. }
  1249. }
  1250. }
  1251. static void bgr24to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1252. int srcSliceH, uint8_t* dst[], int dstStride[]){
  1253. if(dstStride[0]*3==srcStride[0]*2)
  1254. rgb24to16(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
  1255. else
  1256. {
  1257. int i;
  1258. uint8_t *srcPtr= src[0];
  1259. uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
  1260. for(i=0; i<srcSliceH; i++)
  1261. {
  1262. rgb24to16(srcPtr, dstPtr, c->srcW*3);
  1263. srcPtr+= srcStride[0];
  1264. dstPtr+= dstStride[0];
  1265. }
  1266. }
  1267. }
  1268. static void bgr24to15Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1269. int srcSliceH, uint8_t* dst[], int dstStride[]){
  1270. if(dstStride[0]*3==srcStride[0]*2)
  1271. rgb24to15(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
  1272. else
  1273. {
  1274. int i;
  1275. uint8_t *srcPtr= src[0];
  1276. uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
  1277. for(i=0; i<srcSliceH; i++)
  1278. {
  1279. rgb24to15(srcPtr, dstPtr, c->srcW*3);
  1280. srcPtr+= srcStride[0];
  1281. dstPtr+= dstStride[0];
  1282. }
  1283. }
  1284. }
  1285. static void bgr32to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1286. int srcSliceH, uint8_t* dst[], int dstStride[]){
  1287. if(dstStride[0]*4==srcStride[0]*3)
  1288. rgb32to24(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
  1289. else
  1290. {
  1291. int i;
  1292. uint8_t *srcPtr= src[0];
  1293. uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
  1294. for(i=0; i<srcSliceH; i++)
  1295. {
  1296. rgb32to24(srcPtr, dstPtr, c->srcW<<2);
  1297. srcPtr+= srcStride[0];
  1298. dstPtr+= dstStride[0];
  1299. }
  1300. }
  1301. }
  1302. static void bgr32to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1303. int srcSliceH, uint8_t* dst[], int dstStride[]){
  1304. if(dstStride[0]*4==srcStride[0]*2)
  1305. rgb32to16(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
  1306. else
  1307. {
  1308. int i;
  1309. uint8_t *srcPtr= src[0];
  1310. uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
  1311. for(i=0; i<srcSliceH; i++)
  1312. {
  1313. rgb32to16(srcPtr, dstPtr, c->srcW<<2);
  1314. srcPtr+= srcStride[0];
  1315. dstPtr+= dstStride[0];
  1316. }
  1317. }
  1318. }
  1319. static void bgr32to15Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1320. int srcSliceH, uint8_t* dst[], int dstStride[]){
  1321. if(dstStride[0]*4==srcStride[0]*2)
  1322. rgb32to15(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
  1323. else
  1324. {
  1325. int i;
  1326. uint8_t *srcPtr= src[0];
  1327. uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
  1328. for(i=0; i<srcSliceH; i++)
  1329. {
  1330. rgb32to15(srcPtr, dstPtr, c->srcW<<2);
  1331. srcPtr+= srcStride[0];
  1332. dstPtr+= dstStride[0];
  1333. }
  1334. }
  1335. }
  1336. static void bgr15to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1337. int srcSliceH, uint8_t* dst[], int dstStride[]){
  1338. if(dstStride[0]==srcStride[0])
  1339. rgb15to16(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
  1340. else
  1341. {
  1342. int i;
  1343. uint8_t *srcPtr= src[0];
  1344. uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
  1345. for(i=0; i<srcSliceH; i++)
  1346. {
  1347. rgb15to16(srcPtr, dstPtr, c->srcW<<1);
  1348. srcPtr+= srcStride[0];
  1349. dstPtr+= dstStride[0];
  1350. }
  1351. }
  1352. }
  1353. static void bgr15to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1354. int srcSliceH, uint8_t* dst[], int dstStride[]){
  1355. if(dstStride[0]*2==srcStride[0]*3)
  1356. rgb15to24(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
  1357. else
  1358. {
  1359. int i;
  1360. uint8_t *srcPtr= src[0];
  1361. uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
  1362. for(i=0; i<srcSliceH; i++)
  1363. {
  1364. rgb15to24(srcPtr, dstPtr, c->srcW<<1);
  1365. srcPtr+= srcStride[0];
  1366. dstPtr+= dstStride[0];
  1367. }
  1368. }
  1369. }
  1370. static void bgr15to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1371. int srcSliceH, uint8_t* dst[], int dstStride[]){
  1372. if(dstStride[0]*2==srcStride[0]*4)
  1373. rgb15to32(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
  1374. else
  1375. {
  1376. int i;
  1377. uint8_t *srcPtr= src[0];
  1378. uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
  1379. for(i=0; i<srcSliceH; i++)
  1380. {
  1381. rgb15to32(srcPtr, dstPtr, c->srcW<<1);
  1382. srcPtr+= srcStride[0];
  1383. dstPtr+= dstStride[0];
  1384. }
  1385. }
  1386. }
  1387. static void bgr16to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1388. int srcSliceH, uint8_t* dst[], int dstStride[]){
  1389. if(dstStride[0]*2==srcStride[0]*3)
  1390. rgb16to24(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
  1391. else
  1392. {
  1393. int i;
  1394. uint8_t *srcPtr= src[0];
  1395. uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
  1396. for(i=0; i<srcSliceH; i++)
  1397. {
  1398. rgb16to24(srcPtr, dstPtr, c->srcW<<1);
  1399. srcPtr+= srcStride[0];
  1400. dstPtr+= dstStride[0];
  1401. }
  1402. }
  1403. }
  1404. static void bgr16to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1405. int srcSliceH, uint8_t* dst[], int dstStride[]){
  1406. if(dstStride[0]*2==srcStride[0]*4)
  1407. rgb16to32(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
  1408. else
  1409. {
  1410. int i;
  1411. uint8_t *srcPtr= src[0];
  1412. uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
  1413. for(i=0; i<srcSliceH; i++)
  1414. {
  1415. rgb16to32(srcPtr, dstPtr, c->srcW<<1);
  1416. srcPtr+= srcStride[0];
  1417. dstPtr+= dstStride[0];
  1418. }
  1419. }
  1420. }
  1421. static void bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1422. int srcSliceH, uint8_t* dst[], int dstStride[]){
  1423. rgb24toyv12(
  1424. src[0],
  1425. dst[0]+ srcSliceY *dstStride[0],
  1426. dst[1]+(srcSliceY>>1)*dstStride[1],
  1427. dst[2]+(srcSliceY>>1)*dstStride[2],
  1428. c->srcW, srcSliceH,
  1429. dstStride[0], dstStride[1], srcStride[0]);
  1430. }
  1431. /**
  1432. * bring pointers in YUV order instead of YVU
  1433. */
  1434. static inline void orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]){
  1435. if(format == IMGFMT_YV12 || format == IMGFMT_YVU9){
  1436. sortedP[0]= p[0];
  1437. sortedP[1]= p[1];
  1438. sortedP[2]= p[2];
  1439. sortedStride[0]= stride[0];
  1440. sortedStride[1]= stride[1];
  1441. sortedStride[2]= stride[2];
  1442. }
  1443. else if(isPacked(format) || isGray(format))
  1444. {
  1445. sortedP[0]= p[0];
  1446. sortedP[1]=
  1447. sortedP[2]= NULL;
  1448. sortedStride[0]= stride[0];
  1449. sortedStride[1]=
  1450. sortedStride[2]= 0;
  1451. }
  1452. else /* I420 */
  1453. {
  1454. sortedP[0]= p[0];
  1455. sortedP[1]= p[2];
  1456. sortedP[2]= p[1];
  1457. sortedStride[0]= stride[0];
  1458. sortedStride[1]= stride[2];
  1459. sortedStride[2]= stride[1];
  1460. }
  1461. }
  1462. /* unscaled copy like stuff (assumes nearly identical formats) */
  1463. static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
  1464. int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
  1465. int srcStride[3];
  1466. int dstStride[3];
  1467. uint8_t *src[3];
  1468. uint8_t *dst[3];
  1469. orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
  1470. orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
  1471. if(isPacked(c->srcFormat))
  1472. {
  1473. if(dstStride[0]==srcStride[0])
  1474. memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
  1475. else
  1476. {
  1477. int i;
  1478. uint8_t *srcPtr= src[0];
  1479. uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
  1480. int length=0;
  1481. /* universal length finder */
  1482. while(length+c->srcW <= ABS(dstStride[0])
  1483. && length+c->srcW <= ABS(srcStride[0])) length+= c->srcW;
  1484. ASSERT(length!=0);
  1485. for(i=0; i<srcSliceH; i++)
  1486. {
  1487. memcpy(dstPtr, srcPtr, length);
  1488. srcPtr+= srcStride[0];
  1489. dstPtr+= dstStride[0];
  1490. }
  1491. }
  1492. }
  1493. else
  1494. { /* Planar YUV or gray */
  1495. int plane;
  1496. for(plane=0; plane<3; plane++)
  1497. {
  1498. int length= plane==0 ? c->srcW : -((-c->srcW )>>c->chrDstHSubSample);
  1499. int y= plane==0 ? srcSliceY: -((-srcSliceY)>>c->chrDstVSubSample);
  1500. int height= plane==0 ? srcSliceH: -((-srcSliceH)>>c->chrDstVSubSample);
  1501. if((isGray(c->srcFormat) || isGray(c->dstFormat)) && plane>0)
  1502. {
  1503. if(!isGray(c->dstFormat))
  1504. memset(dst[plane], 128, dstStride[plane]*height);
  1505. }
  1506. else
  1507. {
  1508. if(dstStride[plane]==srcStride[plane])
  1509. memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
  1510. else
  1511. {
  1512. int i;
  1513. uint8_t *srcPtr= src[plane];
  1514. uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
  1515. for(i=0; i<height; i++)
  1516. {
  1517. memcpy(dstPtr, srcPtr, length);
  1518. srcPtr+= srcStride[plane];
  1519. dstPtr+= dstStride[plane];
  1520. }
  1521. }
  1522. }
  1523. }
  1524. }
  1525. }
  1526. static int remove_dup_fourcc(int fourcc)
  1527. {
  1528. switch(fourcc)
  1529. {
  1530. case IMGFMT_IYUV: return IMGFMT_I420;
  1531. case IMGFMT_Y8 : return IMGFMT_Y800;
  1532. default: return fourcc;
  1533. }
  1534. }
  1535. static void getSubSampleFactors(int *h, int *v, int format){
  1536. switch(format){
  1537. case IMGFMT_YUY2:
  1538. *h=1;
  1539. *v=0;
  1540. break;
  1541. case IMGFMT_YV12:
  1542. case IMGFMT_I420:
  1543. case IMGFMT_Y800: //FIXME remove after different subsamplings are fully implemented
  1544. *h=1;
  1545. *v=1;
  1546. break;
  1547. case IMGFMT_YVU9:
  1548. *h=2;
  1549. *v=2;
  1550. break;
  1551. default:
  1552. *h=0;
  1553. *v=0;
  1554. break;
  1555. }
  1556. }
  1557. SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
  1558. SwsFilter *srcFilter, SwsFilter *dstFilter){
  1559. SwsContext *c;
  1560. int i;
  1561. int usesFilter;
  1562. int unscaled;
  1563. SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
  1564. #ifdef ARCH_X86
  1565. if(gCpuCaps.hasMMX)
  1566. asm volatile("emms\n\t"::: "memory");
  1567. #endif
  1568. if(swScale==NULL) globalInit();
  1569. //srcFormat= IMGFMT_Y800;
  1570. //srcFormat= IMGFMT_YVU9;
  1571. /* avoid dupplicate Formats, so we dont need to check to much */
  1572. srcFormat = remove_dup_fourcc(srcFormat);
  1573. dstFormat = remove_dup_fourcc(dstFormat);
  1574. unscaled = (srcW == dstW && srcH == dstH);
  1575. if(!isSupportedIn(srcFormat))
  1576. {
  1577. MSG_ERR("swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
  1578. return NULL;
  1579. }
  1580. if(!isSupportedOut(dstFormat))
  1581. {
  1582. MSG_ERR("swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
  1583. return NULL;
  1584. }
  1585. /* sanity check */
  1586. if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
  1587. {
  1588. MSG_ERR("swScaler: %dx%d -> %dx%d is invalid scaling dimension\n",
  1589. srcW, srcH, dstW, dstH);
  1590. return NULL;
  1591. }
  1592. if(!dstFilter) dstFilter= &dummyFilter;
  1593. if(!srcFilter) srcFilter= &dummyFilter;
  1594. c= memalign(64, sizeof(SwsContext));
  1595. memset(c, 0, sizeof(SwsContext));
  1596. c->srcW= srcW;
  1597. c->srcH= srcH;
  1598. c->dstW= dstW;
  1599. c->dstH= dstH;
  1600. c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
  1601. c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
  1602. c->flags= flags;
  1603. c->dstFormat= dstFormat;
  1604. c->srcFormat= srcFormat;
  1605. usesFilter=0;
  1606. if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
  1607. if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
  1608. if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
  1609. if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
  1610. if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
  1611. if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
  1612. if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
  1613. if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
  1614. getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
  1615. getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
  1616. // reuse chroma for 2 pixles rgb/bgr unless user wants full chroma interpolation
  1617. if((isBGR(dstFormat) || isRGB(dstFormat)) && !(flags&SWS_FULL_CHR_H_INT)) c->chrDstHSubSample=1;
  1618. // drop eery 2. pixel for chroma calculation unless user wants full chroma
  1619. if((isBGR(srcFormat) || isRGB(srcFormat) || srcFormat==IMGFMT_YUY2) && !(flags&SWS_FULL_CHR_V))
  1620. c->chrSrcVSubSample=1;
  1621. // drop eery 2. pixel for chroma calculation unless user wants full chroma
  1622. if((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP))
  1623. c->chrSrcHSubSample=1;
  1624. c->chrIntHSubSample= c->chrDstHSubSample;
  1625. c->chrIntVSubSample= c->chrSrcVSubSample;
  1626. // note the -((-x)>>y) is so that we allways round toward +inf
  1627. c->chrSrcW= -((-srcW) >> c->chrSrcHSubSample);
  1628. c->chrSrcH= -((-srcH) >> c->chrSrcVSubSample);
  1629. c->chrDstW= -((-dstW) >> c->chrDstHSubSample);
  1630. c->chrDstH= -((-dstH) >> c->chrDstVSubSample);
  1631. /* printf("%d %d %d %d / %d %d %d %d //\n",
  1632. c->chrSrcW,
  1633. c->chrSrcH,
  1634. c->chrDstW,
  1635. c->chrDstH,
  1636. srcW,
  1637. srcH,
  1638. dstW,
  1639. dstH);*/
  1640. /* unscaled special Cases */
  1641. if(unscaled && !usesFilter)
  1642. {
  1643. /* yv12_to_nv12 */
  1644. if((srcFormat == IMGFMT_YV12||srcFormat==IMGFMT_I420)&&dstFormat == IMGFMT_NV12)
  1645. {
  1646. c->swScale= PlanarToNV12Wrapper;
  1647. if(flags&SWS_PRINT_INFO)
  1648. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1649. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1650. return c;
  1651. }
  1652. /* yv12_to_yuy2 */
  1653. if((srcFormat == IMGFMT_YV12||srcFormat==IMGFMT_I420)&&dstFormat == IMGFMT_YUY2)
  1654. {
  1655. c->swScale= PlanarToYuy2Wrapper;
  1656. if(flags&SWS_PRINT_INFO)
  1657. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1658. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1659. return c;
  1660. }
  1661. /* yuv2bgr */
  1662. if((srcFormat==IMGFMT_YV12 || srcFormat==IMGFMT_I420) && isBGR(dstFormat))
  1663. {
  1664. // FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
  1665. #ifdef WORDS_BIGENDIAN
  1666. if(dstFormat==IMGFMT_BGR32)
  1667. yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_BGR);
  1668. else
  1669. yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
  1670. #else
  1671. yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
  1672. #endif
  1673. c->swScale= planarYuvToBgr;
  1674. if(flags&SWS_PRINT_INFO)
  1675. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1676. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1677. return c;
  1678. }
  1679. #if 1
  1680. /* simple copy */
  1681. if( srcFormat == dstFormat
  1682. || (srcFormat==IMGFMT_YV12 && dstFormat==IMGFMT_I420)
  1683. || (srcFormat==IMGFMT_I420 && dstFormat==IMGFMT_YV12)
  1684. || (isPlanarYUV(srcFormat) && isGray(dstFormat))
  1685. || (isPlanarYUV(dstFormat) && isGray(srcFormat))
  1686. )
  1687. {
  1688. c->swScale= simpleCopy;
  1689. if(flags&SWS_PRINT_INFO)
  1690. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1691. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1692. return c;
  1693. }
  1694. #endif
  1695. /* bgr32to24 & rgb32to24*/
  1696. if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR24)
  1697. ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB24))
  1698. {
  1699. c->swScale= bgr32to24Wrapper;
  1700. if(flags&SWS_PRINT_INFO)
  1701. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1702. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1703. return c;
  1704. }
  1705. /* bgr32to16 & rgb32to16*/
  1706. if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR16)
  1707. ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB16))
  1708. {
  1709. c->swScale= bgr32to16Wrapper;
  1710. if(flags&SWS_PRINT_INFO)
  1711. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1712. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1713. return c;
  1714. }
  1715. /* bgr32to15 & rgb32to15*/
  1716. if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR15)
  1717. ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB15))
  1718. {
  1719. c->swScale= bgr32to15Wrapper;
  1720. if(flags&SWS_PRINT_INFO)
  1721. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1722. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1723. return c;
  1724. }
  1725. /* bgr24to32 & rgb24to32*/
  1726. if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR32)
  1727. ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB32))
  1728. {
  1729. c->swScale= bgr24to32Wrapper;
  1730. if(flags&SWS_PRINT_INFO)
  1731. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1732. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1733. return c;
  1734. }
  1735. /* bgr24to16 & rgb24to16*/
  1736. if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR16)
  1737. ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB16))
  1738. {
  1739. c->swScale= bgr24to16Wrapper;
  1740. if(flags&SWS_PRINT_INFO)
  1741. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1742. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1743. return c;
  1744. }
  1745. /* bgr24to15 & rgb24to15*/
  1746. if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR15)
  1747. ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB15))
  1748. {
  1749. c->swScale= bgr24to15Wrapper;
  1750. if(flags&SWS_PRINT_INFO)
  1751. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1752. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1753. return c;
  1754. }
  1755. /* bgr15to16 */
  1756. if(srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR16)
  1757. {
  1758. c->swScale= bgr15to16Wrapper;
  1759. if(flags&SWS_PRINT_INFO)
  1760. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1761. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1762. return c;
  1763. }
  1764. /* bgr15to24 */
  1765. if((srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR24)
  1766. ||(srcFormat==IMGFMT_RGB15 && dstFormat==IMGFMT_RGB24))
  1767. {
  1768. c->swScale= bgr15to24Wrapper;
  1769. if(flags&SWS_PRINT_INFO)
  1770. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1771. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1772. return c;
  1773. }
  1774. #if 0 //segfaults
  1775. /* bgr15to32 */
  1776. if((srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR32)
  1777. ||(srcFormat==IMGFMT_RGB15 && dstFormat==IMGFMT_RGB32))
  1778. {
  1779. c->swScale= bgr15to32Wrapper;
  1780. if(flags&SWS_PRINT_INFO)
  1781. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1782. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1783. return c;
  1784. }
  1785. #endif
  1786. /* bgr16to24 */
  1787. if((srcFormat==IMGFMT_BGR16 && dstFormat==IMGFMT_BGR24)
  1788. ||(srcFormat==IMGFMT_RGB16 && dstFormat==IMGFMT_RGB24))
  1789. {
  1790. c->swScale= bgr16to24Wrapper;
  1791. if(flags&SWS_PRINT_INFO)
  1792. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1793. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1794. return c;
  1795. }
  1796. #if 0 //segfaults
  1797. /* bgr16to32 */
  1798. if((srcFormat==IMGFMT_BGR16 && dstFormat==IMGFMT_BGR32)
  1799. ||(srcFormat==IMGFMT_RGB16 && dstFormat==IMGFMT_RGB32))
  1800. {
  1801. c->swScale= bgr16to32Wrapper;
  1802. if(flags&SWS_PRINT_INFO)
  1803. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1804. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1805. return c;
  1806. }
  1807. #endif
  1808. /* bgr24toYV12 */
  1809. if(srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_YV12)
  1810. {
  1811. c->swScale= bgr24toyv12Wrapper;
  1812. if(flags&SWS_PRINT_INFO)
  1813. MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
  1814. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1815. return c;
  1816. }
  1817. }
  1818. if(cpuCaps.hasMMX2)
  1819. {
  1820. c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
  1821. if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
  1822. {
  1823. if(flags&SWS_PRINT_INFO)
  1824. MSG_INFO("SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
  1825. }
  1826. }
  1827. else
  1828. c->canMMX2BeUsed=0;
  1829. c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
  1830. c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
  1831. // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
  1832. // but only for the FAST_BILINEAR mode otherwise do correct scaling
  1833. // n-2 is the last chrominance sample available
  1834. // this is not perfect, but noone shuld notice the difference, the more correct variant
  1835. // would be like the vertical one, but that would require some special code for the
  1836. // first and last pixel
  1837. if(flags&SWS_FAST_BILINEAR)
  1838. {
  1839. if(c->canMMX2BeUsed)
  1840. {
  1841. c->lumXInc+= 20;
  1842. c->chrXInc+= 20;
  1843. }
  1844. //we dont use the x86asm scaler if mmx is available
  1845. else if(cpuCaps.hasMMX)
  1846. {
  1847. c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
  1848. c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
  1849. }
  1850. }
  1851. /* precalculate horizontal scaler filter coefficients */
  1852. {
  1853. const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
  1854. initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
  1855. srcW , dstW, filterAlign, 1<<14, flags,
  1856. srcFilter->lumH, dstFilter->lumH);
  1857. initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
  1858. c->chrSrcW, c->chrDstW, filterAlign, 1<<14, flags,
  1859. srcFilter->chrH, dstFilter->chrH);
  1860. #ifdef ARCH_X86
  1861. // cant downscale !!!
  1862. if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
  1863. {
  1864. c->lumMmx2Filter = (int16_t*)memalign(8, (dstW /8+8)*sizeof(int16_t));
  1865. c->chrMmx2Filter = (int16_t*)memalign(8, (c->chrDstW /4+8)*sizeof(int16_t));
  1866. c->lumMmx2FilterPos= (int32_t*)memalign(8, (dstW /2/8+8)*sizeof(int32_t));
  1867. c->chrMmx2FilterPos= (int32_t*)memalign(8, (c->chrDstW/2/4+8)*sizeof(int32_t));
  1868. initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8);
  1869. initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
  1870. }
  1871. #endif
  1872. } // Init Horizontal stuff
  1873. /* precalculate vertical scaler filter coefficients */
  1874. initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
  1875. srcH , dstH, 1, (1<<12)-4, flags,
  1876. srcFilter->lumV, dstFilter->lumV);
  1877. initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
  1878. c->chrSrcH, c->chrDstH, 1, (1<<12)-4, flags,
  1879. srcFilter->chrV, dstFilter->chrV);
  1880. // Calculate Buffer Sizes so that they wont run out while handling these damn slices
  1881. c->vLumBufSize= c->vLumFilterSize;
  1882. c->vChrBufSize= c->vChrFilterSize;
  1883. for(i=0; i<dstH; i++)
  1884. {
  1885. int chrI= i*c->chrDstH / dstH;
  1886. int nextSlice= MAX(c->vLumFilterPos[i ] + c->vLumFilterSize - 1,
  1887. ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<c->chrSrcVSubSample));
  1888. nextSlice&= ~3; // Slices start at boundaries which are divisable through 4
  1889. if(c->vLumFilterPos[i ] + c->vLumBufSize < nextSlice)
  1890. c->vLumBufSize= nextSlice - c->vLumFilterPos[i ];
  1891. if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>c->chrSrcVSubSample))
  1892. c->vChrBufSize= (nextSlice>>c->chrSrcVSubSample) - c->vChrFilterPos[chrI];
  1893. }
  1894. // allocate pixbufs (we use dynamic allocation because otherwise we would need to
  1895. c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
  1896. c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
  1897. //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
  1898. for(i=0; i<c->vLumBufSize; i++)
  1899. c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
  1900. for(i=0; i<c->vChrBufSize; i++)
  1901. c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
  1902. //try to avoid drawing green stuff between the right end and the stride end
  1903. for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
  1904. for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
  1905. ASSERT(c->chrDstH <= dstH)
  1906. // pack filter data for mmx code
  1907. if(cpuCaps.hasMMX)
  1908. {
  1909. c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize* dstH*4*sizeof(int16_t));
  1910. c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
  1911. for(i=0; i<c->vLumFilterSize*dstH; i++)
  1912. c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
  1913. c->vLumFilter[i];
  1914. for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
  1915. c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
  1916. c->vChrFilter[i];
  1917. }
  1918. if(flags&SWS_PRINT_INFO)
  1919. {
  1920. #ifdef DITHER1XBPP
  1921. char *dither= " dithered";
  1922. #else
  1923. char *dither= "";
  1924. #endif
  1925. if(flags&SWS_FAST_BILINEAR)
  1926. MSG_INFO("\nSwScaler: FAST_BILINEAR scaler, ");
  1927. else if(flags&SWS_BILINEAR)
  1928. MSG_INFO("\nSwScaler: BILINEAR scaler, ");
  1929. else if(flags&SWS_BICUBIC)
  1930. MSG_INFO("\nSwScaler: BICUBIC scaler, ");
  1931. else if(flags&SWS_X)
  1932. MSG_INFO("\nSwScaler: Experimental scaler, ");
  1933. else if(flags&SWS_POINT)
  1934. MSG_INFO("\nSwScaler: Nearest Neighbor / POINT scaler, ");
  1935. else if(flags&SWS_AREA)
  1936. MSG_INFO("\nSwScaler: Area Averageing scaler, ");
  1937. else
  1938. MSG_INFO("\nSwScaler: ehh flags invalid?! ");
  1939. if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
  1940. MSG_INFO("from %s to%s %s ",
  1941. vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
  1942. else
  1943. MSG_INFO("from %s to %s ",
  1944. vo_format_name(srcFormat), vo_format_name(dstFormat));
  1945. if(cpuCaps.hasMMX2)
  1946. MSG_INFO("using MMX2\n");
  1947. else if(cpuCaps.has3DNow)
  1948. MSG_INFO("using 3DNOW\n");
  1949. else if(cpuCaps.hasMMX)
  1950. MSG_INFO("using MMX\n");
  1951. else
  1952. MSG_INFO("using C\n");
  1953. }
  1954. if((flags & SWS_PRINT_INFO) && verbose)
  1955. {
  1956. if(cpuCaps.hasMMX)
  1957. {
  1958. if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
  1959. MSG_V("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
  1960. else
  1961. {
  1962. if(c->hLumFilterSize==4)
  1963. MSG_V("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
  1964. else if(c->hLumFilterSize==8)
  1965. MSG_V("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
  1966. else
  1967. MSG_V("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
  1968. if(c->hChrFilterSize==4)
  1969. MSG_V("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
  1970. else if(c->hChrFilterSize==8)
  1971. MSG_V("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
  1972. else
  1973. MSG_V("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
  1974. }
  1975. }
  1976. else
  1977. {
  1978. #ifdef ARCH_X86
  1979. MSG_V("SwScaler: using X86-Asm scaler for horizontal scaling\n");
  1980. #else
  1981. if(flags & SWS_FAST_BILINEAR)
  1982. MSG_V("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
  1983. else
  1984. MSG_V("SwScaler: using C scaler for horizontal scaling\n");
  1985. #endif
  1986. }
  1987. if(isPlanarYUV(dstFormat))
  1988. {
  1989. if(c->vLumFilterSize==1)
  1990. MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
  1991. else
  1992. MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
  1993. }
  1994. else
  1995. {
  1996. if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
  1997. MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
  1998. "SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
  1999. else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
  2000. MSG_V("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
  2001. else
  2002. MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
  2003. }
  2004. if(dstFormat==IMGFMT_BGR24)
  2005. MSG_V("SwScaler: using %s YV12->BGR24 Converter\n",
  2006. cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
  2007. else if(dstFormat==IMGFMT_BGR32)
  2008. MSG_V("SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
  2009. else if(dstFormat==IMGFMT_BGR16)
  2010. MSG_V("SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
  2011. else if(dstFormat==IMGFMT_BGR15)
  2012. MSG_V("SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
  2013. MSG_V("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
  2014. }
  2015. if((flags & SWS_PRINT_INFO) && verbose>1)
  2016. {
  2017. MSG_DBG2("SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
  2018. c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
  2019. MSG_DBG2("SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
  2020. c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
  2021. }
  2022. c->swScale= swScale;
  2023. return c;
  2024. }
  2025. /**
  2026. * returns a normalized gaussian curve used to filter stuff
  2027. * quality=3 is high quality, lowwer is lowwer quality
  2028. */
  2029. SwsVector *getGaussianVec(double variance, double quality){
  2030. const int length= (int)(variance*quality + 0.5) | 1;
  2031. int i;
  2032. double *coeff= memalign(sizeof(double), length*sizeof(double));
  2033. double middle= (length-1)*0.5;
  2034. SwsVector *vec= malloc(sizeof(SwsVector));
  2035. vec->coeff= coeff;
  2036. vec->length= length;
  2037. for(i=0; i<length; i++)
  2038. {
  2039. double dist= i-middle;
  2040. coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
  2041. }
  2042. normalizeVec(vec, 1.0);
  2043. return vec;
  2044. }
  2045. SwsVector *getConstVec(double c, int length){
  2046. int i;
  2047. double *coeff= memalign(sizeof(double), length*sizeof(double));
  2048. SwsVector *vec= malloc(sizeof(SwsVector));
  2049. vec->coeff= coeff;
  2050. vec->length= length;
  2051. for(i=0; i<length; i++)
  2052. coeff[i]= c;
  2053. return vec;
  2054. }
  2055. SwsVector *getIdentityVec(void){
  2056. double *coeff= memalign(sizeof(double), sizeof(double));
  2057. SwsVector *vec= malloc(sizeof(SwsVector));
  2058. coeff[0]= 1.0;
  2059. vec->coeff= coeff;
  2060. vec->length= 1;
  2061. return vec;
  2062. }
  2063. void normalizeVec(SwsVector *a, double height){
  2064. int i;
  2065. double sum=0;
  2066. double inv;
  2067. for(i=0; i<a->length; i++)
  2068. sum+= a->coeff[i];
  2069. inv= height/sum;
  2070. for(i=0; i<a->length; i++)
  2071. a->coeff[i]*= height;
  2072. }
  2073. void scaleVec(SwsVector *a, double scalar){
  2074. int i;
  2075. for(i=0; i<a->length; i++)
  2076. a->coeff[i]*= scalar;
  2077. }
  2078. static SwsVector *getConvVec(SwsVector *a, SwsVector *b){
  2079. int length= a->length + b->length - 1;
  2080. double *coeff= memalign(sizeof(double), length*sizeof(double));
  2081. int i, j;
  2082. SwsVector *vec= malloc(sizeof(SwsVector));
  2083. vec->coeff= coeff;
  2084. vec->length= length;
  2085. for(i=0; i<length; i++) coeff[i]= 0.0;
  2086. for(i=0; i<a->length; i++)
  2087. {
  2088. for(j=0; j<b->length; j++)
  2089. {
  2090. coeff[i+j]+= a->coeff[i]*b->coeff[j];
  2091. }
  2092. }
  2093. return vec;
  2094. }
  2095. static SwsVector *sumVec(SwsVector *a, SwsVector *b){
  2096. int length= MAX(a->length, b->length);
  2097. double *coeff= memalign(sizeof(double), length*sizeof(double));
  2098. int i;
  2099. SwsVector *vec= malloc(sizeof(SwsVector));
  2100. vec->coeff= coeff;
  2101. vec->length= length;
  2102. for(i=0; i<length; i++) coeff[i]= 0.0;
  2103. for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
  2104. for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];
  2105. return vec;
  2106. }
  2107. static SwsVector *diffVec(SwsVector *a, SwsVector *b){
  2108. int length= MAX(a->length, b->length);
  2109. double *coeff= memalign(sizeof(double), length*sizeof(double));
  2110. int i;
  2111. SwsVector *vec= malloc(sizeof(SwsVector));
  2112. vec->coeff= coeff;
  2113. vec->length= length;
  2114. for(i=0; i<length; i++) coeff[i]= 0.0;
  2115. for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
  2116. for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];
  2117. return vec;
  2118. }
  2119. /* shift left / or right if "shift" is negative */
  2120. static SwsVector *getShiftedVec(SwsVector *a, int shift){
  2121. int length= a->length + ABS(shift)*2;
  2122. double *coeff= memalign(sizeof(double), length*sizeof(double));
  2123. int i;
  2124. SwsVector *vec= malloc(sizeof(SwsVector));
  2125. vec->coeff= coeff;
  2126. vec->length= length;
  2127. for(i=0; i<length; i++) coeff[i]= 0.0;
  2128. for(i=0; i<a->length; i++)
  2129. {
  2130. coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
  2131. }
  2132. return vec;
  2133. }
  2134. void shiftVec(SwsVector *a, int shift){
  2135. SwsVector *shifted= getShiftedVec(a, shift);
  2136. free(a->coeff);
  2137. a->coeff= shifted->coeff;
  2138. a->length= shifted->length;
  2139. free(shifted);
  2140. }
  2141. void addVec(SwsVector *a, SwsVector *b){
  2142. SwsVector *sum= sumVec(a, b);
  2143. free(a->coeff);
  2144. a->coeff= sum->coeff;
  2145. a->length= sum->length;
  2146. free(sum);
  2147. }
  2148. void subVec(SwsVector *a, SwsVector *b){
  2149. SwsVector *diff= diffVec(a, b);
  2150. free(a->coeff);
  2151. a->coeff= diff->coeff;
  2152. a->length= diff->length;
  2153. free(diff);
  2154. }
  2155. void convVec(SwsVector *a, SwsVector *b){
  2156. SwsVector *conv= getConvVec(a, b);
  2157. free(a->coeff);
  2158. a->coeff= conv->coeff;
  2159. a->length= conv->length;
  2160. free(conv);
  2161. }
  2162. SwsVector *cloneVec(SwsVector *a){
  2163. double *coeff= memalign(sizeof(double), a->length*sizeof(double));
  2164. int i;
  2165. SwsVector *vec= malloc(sizeof(SwsVector));
  2166. vec->coeff= coeff;
  2167. vec->length= a->length;
  2168. for(i=0; i<a->length; i++) coeff[i]= a->coeff[i];
  2169. return vec;
  2170. }
  2171. void printVec(SwsVector *a){
  2172. int i;
  2173. double max=0;
  2174. double min=0;
  2175. double range;
  2176. for(i=0; i<a->length; i++)
  2177. if(a->coeff[i]>max) max= a->coeff[i];
  2178. for(i=0; i<a->length; i++)
  2179. if(a->coeff[i]<min) min= a->coeff[i];
  2180. range= max - min;
  2181. for(i=0; i<a->length; i++)
  2182. {
  2183. int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
  2184. MSG_DBG2("%1.3f ", a->coeff[i]);
  2185. for(;x>0; x--) MSG_DBG2(" ");
  2186. MSG_DBG2("|\n");
  2187. }
  2188. }
  2189. void freeVec(SwsVector *a){
  2190. if(!a) return;
  2191. if(a->coeff) free(a->coeff);
  2192. a->coeff=NULL;
  2193. a->length=0;
  2194. free(a);
  2195. }
  2196. void freeSwsContext(SwsContext *c){
  2197. int i;
  2198. if(!c) return;
  2199. if(c->lumPixBuf)
  2200. {
  2201. for(i=0; i<c->vLumBufSize; i++)
  2202. {
  2203. if(c->lumPixBuf[i]) free(c->lumPixBuf[i]);
  2204. c->lumPixBuf[i]=NULL;
  2205. }
  2206. free(c->lumPixBuf);
  2207. c->lumPixBuf=NULL;
  2208. }
  2209. if(c->chrPixBuf)
  2210. {
  2211. for(i=0; i<c->vChrBufSize; i++)
  2212. {
  2213. if(c->chrPixBuf[i]) free(c->chrPixBuf[i]);
  2214. c->chrPixBuf[i]=NULL;
  2215. }
  2216. free(c->chrPixBuf);
  2217. c->chrPixBuf=NULL;
  2218. }
  2219. if(c->vLumFilter) free(c->vLumFilter);
  2220. c->vLumFilter = NULL;
  2221. if(c->vChrFilter) free(c->vChrFilter);
  2222. c->vChrFilter = NULL;
  2223. if(c->hLumFilter) free(c->hLumFilter);
  2224. c->hLumFilter = NULL;
  2225. if(c->hChrFilter) free(c->hChrFilter);
  2226. c->hChrFilter = NULL;
  2227. if(c->vLumFilterPos) free(c->vLumFilterPos);
  2228. c->vLumFilterPos = NULL;
  2229. if(c->vChrFilterPos) free(c->vChrFilterPos);
  2230. c->vChrFilterPos = NULL;
  2231. if(c->hLumFilterPos) free(c->hLumFilterPos);
  2232. c->hLumFilterPos = NULL;
  2233. if(c->hChrFilterPos) free(c->hChrFilterPos);
  2234. c->hChrFilterPos = NULL;
  2235. if(c->lumMmxFilter) free(c->lumMmxFilter);
  2236. c->lumMmxFilter = NULL;
  2237. if(c->chrMmxFilter) free(c->chrMmxFilter);
  2238. c->chrMmxFilter = NULL;
  2239. if(c->lumMmx2Filter) free(c->lumMmx2Filter);
  2240. c->lumMmx2Filter=NULL;
  2241. if(c->chrMmx2Filter) free(c->chrMmx2Filter);
  2242. c->chrMmx2Filter=NULL;
  2243. if(c->lumMmx2FilterPos) free(c->lumMmx2FilterPos);
  2244. c->lumMmx2FilterPos=NULL;
  2245. if(c->chrMmx2FilterPos) free(c->chrMmx2FilterPos);
  2246. c->chrMmx2FilterPos=NULL;
  2247. free(c);
  2248. }