// Software scaling and colorspace conversion routines for MPlayer

#include <inttypes.h>
#include <string.h> // memcpy()
#include "../config.h"

#undef HAVE_MMX2 // code is buggy
//#undef HAVE_MMX

#define RET 0xC3 // near return opcode

// temporary storage for 4 yuv lines:
// 16bit for now (mmx likes it more compact)
static uint16_t pix_buf_y[4][2048];
static uint16_t pix_buf_uv[2][2048*2];

// clipping helper table for C implementations:
static unsigned char clip_table[768];

// yuv->rgb conversion tables:
static int yuvtab_2568[256];
static int yuvtab_3343[256];
static int yuvtab_0c92[256];
static int yuvtab_1a1e[256];
static int yuvtab_40cf[256];

static uint64_t yCoeff=    0x2568256825682568LL;
static uint64_t ubCoeff=   0x3343334333433343LL;
static uint64_t vrCoeff=   0x40cf40cf40cf40cfLL;
static uint64_t ugCoeff=   0xE5E2E5E2E5E2E5E2LL;
static uint64_t vgCoeff=   0xF36EF36EF36EF36ELL;
static uint64_t w80=       0x0080008000800080LL;
static uint64_t w10=       0x0010001000100010LL;

static uint64_t b16Dither= 0x0004000400040004LL;
static uint64_t b16Dither1=0x0004000400040004LL;
static uint64_t b16Dither2=0x0602060206020602LL;
static uint64_t g16Dither= 0x0002000200020002LL;
static uint64_t g16Dither1=0x0002000200020002LL;
static uint64_t g16Dither2=0x0301030103010301LL;

static uint64_t b16Mask=   0x001F001F001F001FLL;
static uint64_t g16Mask=   0x07E007E007E007E0LL;
static uint64_t r16Mask=   0xF800F800F800F800LL;

static uint64_t temp0;

static uint8_t funnyYCode[10000];
static uint8_t funnyUVCode[10000];
// *** bilinear scaling and yuv->rgb conversion of yv12 slices:
// *** Note: it's called multiple times while decoding a frame, first time y==0
// *** Designed to upscale, but may work for downscale too.
// s_xinc = (src_width << 8) / dst_width
// s_yinc = (src_height << 16) / dst_height
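// Worked example (not from the original source, just to illustrate the fixed-point
// increments): upscaling 320x240 to 640x480 gives
//   s_xinc = (320<<8)/640  = 128    (0.5 in 8.8 fixed point)
//   s_yinc = (240<<16)/480 = 32768  (0.5 in 16.16 fixed point)
// i.e. the source position advances by half a pixel per output pixel in each direction.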
void SwScale_YV12slice_brg24(unsigned char* srcptr[], int stride[], int y, int h,
                             unsigned char* dstptr, int dststride, int dstw, int dstbpp,
                             unsigned int s_xinc, unsigned int s_yinc){

  // scaling factors:
  //static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height;
  //static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width;

  unsigned int s_xinc2=s_xinc>>1;

  static int s_srcypos;
  static int s_ypos;
  static int s_last_ypos;
  static int static_dstw;

#ifdef HAVE_MMX2
  static int old_dstw= -1;
  static int old_s_xinc= -1;
#endif

  s_xinc&= -2; // clear the last bit, or U/V and Y might be shifted relative to each other
  if(y==0){
    s_srcypos=-2*s_yinc;
    s_ypos=-2;
    s_last_ypos=-2;

#ifdef HAVE_MMX2
    // can't downscale !!!
    if(old_s_xinc != s_xinc || old_dstw!=dstw)
    {
      uint8_t *fragment;
      int imm8OfPShufW1;
      int imm8OfPShufW2;
      int fragmentLength;
      int xpos, xx, xalpha, i;

      old_s_xinc= s_xinc;
      old_dstw= dstw;
      static_dstw= dstw;

      // create an optimized horizontal scaling routine

      //code fragment
      // fragmentLength=0;
      // printf("%d, %d\n", fragmentLength,imm8OfPShufW1);

      asm volatile(
        "jmp 9f \n\t"
        // Begin
        "0: \n\t"
        "movq (%%esi, %%ebx), %%mm0 \n\t" //FIXME Alignment
        "movq %%mm0, %%mm1 \n\t"
        "psrlq $8, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "pshufw $0xFF, %%mm1, %%mm1 \n\t"
        "1: \n\t"
        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
        "2: \n\t"
        "psubw %%mm1, %%mm0 \n\t"
        "psraw $1, %%mm0 \n\t"
        "pmullw %%mm2, %%mm0 \n\t"
        "psllw $7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%%edi, %%eax) \n\t"
        "paddb %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFF
        "addb %%ch, %%cl \n\t" // 2*xalpha += (4*s_xinc)&0xFF
        "adcl %%edx, %%ebx \n\t" // xx += (4*s_xinc)>>8 + carry
        "addl $8, %%eax \n\t"
        // End
        "9: \n\t"
        // "int $3\n\t"
        "leal 0b, %0 \n\t"
        "leal 1b, %1 \n\t"
        "leal 2b, %2 \n\t"
        "decl %1 \n\t"
        "decl %2 \n\t"
        "subl %0, %1 \n\t"
        "subl %0, %2 \n\t"
        "leal 9b, %3 \n\t"
        "subl %0, %3 \n\t"
        :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
         "=r" (fragmentLength)
      );

      xpos= xx= xalpha= 0;
      //FIXME choose size and or xinc so that they fit exactly
      for(i=0; i<dstw/8; i++)
      {
        int xx=xpos>>8;

        if((i&3) == 0)
        {
          int a=0;
          int b=((xpos+s_xinc)>>8) - xx;
          int c=((xpos+s_xinc*2)>>8) - xx;
          int d=((xpos+s_xinc*3)>>8) - xx;

          memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength);

          funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]=
          funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
            a | (b<<2) | (c<<4) | (d<<6);

          funnyYCode[fragmentLength*(i+4)/4]= RET;
        }
        xpos+=s_xinc;
      }

      xpos= xx= xalpha= 0;
      //FIXME choose size and or xinc so that they fit exactly
      for(i=0; i<dstw/8; i++)
      {
        int xx=xpos>>8;

        if((i&3) == 0)
        {
          int a=0;
          int b=((xpos+s_xinc2)>>8) - xx;
          int c=((xpos+s_xinc2*2)>>8) - xx;
          int d=((xpos+s_xinc2*3)>>8) - xx;

          memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);

          funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
          funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
            a | (b<<2) | (c<<4) | (d<<6);

          funnyUVCode[fragmentLength*(i+4)/4]= RET;
        }
        xpos+=s_xinc2;
      }
      // funnyCode[0]= RET;
    }
#endif
  } // reset counters
  while(1){
    unsigned char *dest=dstptr+dststride*s_ypos;
    int y0=2+(s_srcypos>>16);
    int y1=1+(s_srcypos>>17);
    int yalpha=(s_srcypos&0xFFFF)>>7;
    int yalpha1=yalpha^511;
    int uvalpha=((s_srcypos>>1)&0xFFFF)>>7;
    int uvalpha1=uvalpha^511;
    uint16_t *buf0=pix_buf_y[y0&3];
    uint16_t *buf1=pix_buf_y[((y0+1)&3)];
    uint16_t *uvbuf0=pix_buf_uv[y1&1];
    uint16_t *uvbuf1=pix_buf_uv[(y1&1)^1];
    int i;

    if(y0>=y+h) break;

    s_ypos++; s_srcypos+=s_yinc;

    if(s_last_ypos!=y0){
      unsigned char *src=srcptr[0]+(y0-y)*stride[0];
      unsigned int xpos=0;
      s_last_ypos=y0;
      // *** horizontal scale Y line to temp buffer
      // this loop should be rewritten in MMX assembly!!!!
#ifdef HAVE_MMX2
      asm volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
        "movd %5, %%mm6 \n\t" // s_xinc&0xFF
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "movq %%mm6, %%mm2 \n\t"
        "psllq $16, %%mm2 \n\t"
        "paddb %%mm6, %%mm2 \n\t"
        "psllq $16, %%mm2 \n\t"
        "paddb %%mm6, %%mm2 \n\t"
        "psllq $16, %%mm2 \n\t" // 0,t,2t,3t  t=s_xinc&0xFF
        "movq %%mm2, temp0 \n\t"
        "movd %4, %%mm6 \n\t" // (s_xinc*4)&0xFF
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "xorl %%eax, %%eax \n\t" // i
        "xorl %%ebx, %%ebx \n\t" // xx
        "movl %0, %%esi \n\t" // src
        "movl %1, %%edi \n\t" // buf1
        "movl %3, %%edx \n\t" // (s_xinc*4)>>8
        "xorl %%ecx, %%ecx \n\t"
        "movb %4, %%ch \n\t" // (s_xinc*4)&0xFF
        // "int $3\n\t"
        "call funnyYCode \n\t"
        "movq temp0, %%mm2 \n\t"
        "xorb %%cl, %%cl \n\t"
        "call funnyYCode \n\t"
        "movq temp0, %%mm2 \n\t"
        "xorb %%cl, %%cl \n\t"
        "call funnyYCode \n\t"
        "movq temp0, %%mm2 \n\t"
        "xorb %%cl, %%cl \n\t"
        "call funnyYCode \n\t"
        "movq temp0, %%mm2 \n\t"
        "xorb %%cl, %%cl \n\t"
        "call funnyYCode \n\t"
        "movq temp0, %%mm2 \n\t"
        "xorb %%cl, %%cl \n\t"
        "call funnyYCode \n\t"
        "movq temp0, %%mm2 \n\t"
        "xorb %%cl, %%cl \n\t"
        "call funnyYCode \n\t"
        "movq temp0, %%mm2 \n\t"
        "xorb %%cl, %%cl \n\t"
        "call funnyYCode \n\t"
        :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>8),
           "m" ((s_xinc*4)&0xFF), "m" (s_xinc&0xFF)
        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
      );
#elif defined (ARCH_X86)
      //NO MMX just normal asm ... FIXME try/write funny MMX2 variant
      //FIXME add prefetch
      asm volatile(
        "xorl %%eax, %%eax \n\t" // i
        "xorl %%ebx, %%ebx \n\t" // xx
        "xorl %%ecx, %%ecx \n\t" // 2*xalpha
        "1: \n\t"
        "movzbl (%0, %%ebx), %%edi \n\t" // src[xx]
        "movzbl 1(%0, %%ebx), %%esi \n\t" // src[xx+1]
        "subl %%edi, %%esi \n\t" // src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" // (src[xx+1] - src[xx])*2*xalpha
        "shll $8, %%edi \n\t"
        "addl %%edi, %%esi \n\t" // src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "movl %1, %%edi \n\t"
        "shrl $1, %%esi \n\t"
        "movw %%si, (%%edi, %%eax, 2) \n\t"
        "addb %4, %%cl \n\t" // 2*xalpha += s_xinc&0xFF
        "adcl %3, %%ebx \n\t" // xx += s_xinc>>8 + carry
        "movzbl (%0, %%ebx), %%edi \n\t" // src[xx]
        "movzbl 1(%0, %%ebx), %%esi \n\t" // src[xx+1]
        "subl %%edi, %%esi \n\t" // src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" // (src[xx+1] - src[xx])*2*xalpha
        "shll $8, %%edi \n\t"
        "addl %%edi, %%esi \n\t" // src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "movl %1, %%edi \n\t"
        "shrl $1, %%esi \n\t"
        "movw %%si, 2(%%edi, %%eax, 2) \n\t"
        "addb %4, %%cl \n\t" // 2*xalpha += s_xinc&0xFF
        "adcl %3, %%ebx \n\t" // xx += s_xinc>>8 + carry
        "addl $2, %%eax \n\t"
        "cmpl %2, %%eax \n\t"
        " jb 1b \n\t"
        :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>8), "m" (s_xinc&0xFF)
        : "%eax", "%ebx", "%ecx", "%edi", "%esi"
      );
#else
      for(i=0;i<dstw;i++){
        register unsigned int xx=xpos>>8;
        register unsigned int xalpha=(xpos&0xFF)>>1;
        buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha);
        xpos+=s_xinc;
      }
#endif
      // *** horizontal scale U and V lines to temp buffer
      if(!(y0&1)){
        unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1];
        unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2];
        xpos=0;
        // this loop should be rewritten in MMX assembly!!!!
#ifdef HAVE_MMX2
        asm volatile(
          "pxor %%mm7, %%mm7 \n\t"
          "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
          "movd %5, %%mm6 \n\t" // s_xinc&0xFF
          "punpcklwd %%mm6, %%mm6 \n\t"
          "punpcklwd %%mm6, %%mm6 \n\t"
          "movq %%mm6, %%mm2 \n\t"
          "psllq $16, %%mm2 \n\t"
          "paddb %%mm6, %%mm2 \n\t"
          "psllq $16, %%mm2 \n\t"
          "paddb %%mm6, %%mm2 \n\t"
          "psllq $16, %%mm2 \n\t" // 0,t,2t,3t  t=s_xinc&0xFF
          "movq %%mm2, temp0 \n\t"
          "movd %4, %%mm6 \n\t" // (s_xinc*4)&0xFF
          "punpcklwd %%mm6, %%mm6 \n\t"
          "punpcklwd %%mm6, %%mm6 \n\t"
          "xorl %%eax, %%eax \n\t" // i
          "xorl %%ebx, %%ebx \n\t" // xx
          "movl %0, %%esi \n\t" // src
          "movl %1, %%edi \n\t" // buf1
          "movl %3, %%edx \n\t" // (s_xinc*4)>>8
          "xorl %%ecx, %%ecx \n\t"
          "movb %4, %%ch \n\t" // (s_xinc*4)&0xFF
          // "int $3\n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "xorl %%eax, %%eax \n\t" // i
          "xorl %%ebx, %%ebx \n\t" // xx
          "movl %6, %%esi \n\t" // src
          "movl %1, %%edi \n\t" // buf1
          "addl $4096, %%edi \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>8),
             "m" ((s_xinc2*4)&0xFF), "m" (s_xinc2&0xFF), "m" (src2)
          : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
        );
#elif defined (ARCH_X86)
        //NO MMX just normal asm ... FIXME try/write funny MMX2 variant
        asm volatile(
          "xorl %%eax, %%eax \n\t" // i
          "xorl %%ebx, %%ebx \n\t" // xx
          "xorl %%ecx, %%ecx \n\t" // 2*xalpha
          "1: \n\t"
          "movl %0, %%esi \n\t"
          "movzbl (%%esi, %%ebx), %%edi \n\t" // src[xx]
          "movzbl 1(%%esi, %%ebx), %%esi \n\t" // src[xx+1]
          "subl %%edi, %%esi \n\t" // src[xx+1] - src[xx]
          "imull %%ecx, %%esi \n\t" // (src[xx+1] - src[xx])*2*xalpha
          "shll $8, %%edi \n\t"
          "addl %%edi, %%esi \n\t" // src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
          "movl %1, %%edi \n\t"
          "shrl $1, %%esi \n\t"
          "movw %%si, (%%edi, %%eax, 2) \n\t"
          "movzbl (%5, %%ebx), %%edi \n\t" // src[xx]
          "movzbl 1(%5, %%ebx), %%esi \n\t" // src[xx+1]
          "subl %%edi, %%esi \n\t" // src[xx+1] - src[xx]
          "imull %%ecx, %%esi \n\t" // (src[xx+1] - src[xx])*2*xalpha
          "shll $8, %%edi \n\t"
          "addl %%edi, %%esi \n\t" // src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
          "movl %1, %%edi \n\t"
          "shrl $1, %%esi \n\t"
          "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
          "addb %4, %%cl \n\t" // 2*xalpha += s_xinc&0xFF
          "adcl %3, %%ebx \n\t" // xx += s_xinc>>8 + carry
          "addl $1, %%eax \n\t"
          "cmpl %2, %%eax \n\t"
          " jb 1b \n\t"
          :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>8), "m" (s_xinc2&0xFF),
             "r" (src2)
          : "%eax", "%ebx", "%ecx", "%edi", "%esi"
        );
#else
        for(i=0;i<dstw;i++){
          register unsigned int xx=xpos>>8;
          register unsigned int xalpha=(xpos&0xFF)>>1;
          uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
          uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
          xpos+=s_xinc2;
        }
#endif
      }
      if(!y0) continue;
    }
    // Note1: this code can be restricted to n*8 (or n*16) width lines to simplify optimization...
    // Re: Note1: ok n*4 for now
    // Note2: instead of using lookup tabs, mmx version could do the multiply...
    // Re: Note2: yep
    // Note3: maybe we should make separate 15/16, 24 and 32bpp versions of this:
    // Re: done (32 & 16) and 16 has dithering :) but 16 is untested
#ifdef HAVE_MMX
    //FIXME write lq version with less uv ...
    //FIXME reorder / optimize
    if(dstbpp == 4)
    {
      asm volatile(

#define YSCALEYUV2RGB \
        "pxor %%mm7, %%mm7 \n\t"\
        "movd %6, %%mm6 \n\t" /*yalpha1*/\
        "punpcklwd %%mm6, %%mm6 \n\t"\
        "punpcklwd %%mm6, %%mm6 \n\t"\
        "movd %7, %%mm5 \n\t" /*uvalpha1*/\
        "punpcklwd %%mm5, %%mm5 \n\t"\
        "punpcklwd %%mm5, %%mm5 \n\t"\
        "xorl %%eax, %%eax \n\t"\
        "1: \n\t"\
        "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
        "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
        "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
        "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
        "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
        "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
        "psubw w10, %%mm1 \n\t" /* Y-16*/\
        "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\
        "pmulhw yCoeff, %%mm1 \n\t"\
\
        "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
        "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
        "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
        "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
        "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
        "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
        "psubw w80, %%mm3 \n\t" /* (U-128)*/\
        "psllw $3, %%mm3 \n\t" /*(U-128)8*/\
\
        "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
        "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
        "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
        "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
        "psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
        "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
        "psubw w80, %%mm0 \n\t" /* (V-128)*/\
        "psllw $3, %%mm0 \n\t" /* (V-128)8*/\
\
        "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
        "pmulhw ubCoeff, %%mm3 \n\t"\
        "paddw %%mm1, %%mm3 \n\t" /* B*/\
\
        "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
        "pmulhw vrCoeff, %%mm0 \n\t"\
        "paddw %%mm1, %%mm0 \n\t" /* R*/\
\
        "pmulhw ugCoeff, %%mm2 \n\t"\
        "pmulhw vgCoeff, %%mm4 \n\t"\
        "paddw %%mm4, %%mm2 \n\t"\
        "paddw %%mm2, %%mm1 \n\t" /* G*/\
\
        "packuswb %%mm3, %%mm3 \n\t"\
        "packuswb %%mm0, %%mm0 \n\t"\
        "packuswb %%mm1, %%mm1 \n\t"

        YSCALEYUV2RGB
        "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
        "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
        "movq %%mm3, %%mm1 \n\t"
        "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
        "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
#ifdef HAVE_MMX2
        "movntq %%mm3, (%4, %%eax, 4) \n\t"
        "movntq %%mm1, 8(%4, %%eax, 4) \n\t"
#else
        "movq %%mm3, (%4, %%eax, 4) \n\t"
        "movq %%mm1, 8(%4, %%eax, 4) \n\t"
#endif
        "addl $4, %%eax \n\t"
        "cmpl %5, %%eax \n\t"
        " jb 1b \n\t"
        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
           "m" (yalpha1), "m" (uvalpha1)
        : "%eax"
      );
    }
    else if(dstbpp==2)
    {
      asm volatile(
        YSCALEYUV2RGB
        "paddusb g16Dither, %%mm1 \n\t"
        "paddusb b16Dither, %%mm0 \n\t"
        "paddusb b16Dither, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
        "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
        "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
        "psrlw $3, %%mm3 \n\t"
        "psllw $3, %%mm1 \n\t"
        "psllw $8, %%mm0 \n\t"
        "pand g16Mask, %%mm1 \n\t"
        "pand r16Mask, %%mm0 \n\t"
        "por %%mm3, %%mm1 \n\t"
        "por %%mm1, %%mm0 \n\t"
#ifdef HAVE_MMX2
        "movntq %%mm0, (%4, %%eax, 2) \n\t"
#else
        "movq %%mm0, (%4, %%eax, 2) \n\t"
#endif
        "addl $4, %%eax \n\t"
        "cmpl %5, %%eax \n\t"
        " jb 1b \n\t"
        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
           "m" (yalpha1), "m" (uvalpha1)
        : "%eax"
      );
    }
#else
    if(dstbpp==4 || dstbpp==3)
    {
      for(i=0;i<dstw;i++){
        // vertical linear interpolation && yuv2rgb in a single step:
        int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
        int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
        int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);

        dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
        dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
        dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
        dest+=dstbpp;
      }
    }
    else if(dstbpp==2) //16bit
    {
      for(i=0;i<dstw;i++){
        // vertical linear interpolation && yuv2rgb in a single step:
        int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
        int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
        int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);

        ((uint16_t*)dest)[0] =
          (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
          ((clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0) |
          ((clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800);
        dest+=dstbpp;
      }
    }
    else if(dstbpp==2) //15bit FIXME how do I figure out if it's 15 or 16?
    {
      for(i=0;i<dstw;i++){
        // vertical linear interpolation && yuv2rgb in a single step:
        int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
        int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
        int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);

        ((uint16_t*)dest)[0] =
          (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
          ((clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0) |
          ((clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00);
        dest+=dstbpp;
      }
    }
#endif
    b16Dither= b16Dither1;
    b16Dither1= b16Dither2;
    b16Dither2= b16Dither;

    g16Dither= g16Dither1;
    g16Dither1= g16Dither2;
    g16Dither2= g16Dither;
  }
}

void SwScale_Init(){
  // generating tables:
  int i;
  for(i=0;i<256;i++){
    clip_table[i]=0;
    clip_table[i+256]=i;
    clip_table[i+512]=255;
    yuvtab_2568[i]=(0x2568*(i-16))+(256<<13);
    yuvtab_3343[i]=0x3343*(i-128);
    yuvtab_0c92[i]=-0x0c92*(i-128);
    yuvtab_1a1e[i]=-0x1a1e*(i-128);
    yuvtab_40cf[i]=0x40cf*(i-128);
  }
}
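
// A minimal usage sketch (not part of the original file): how a caller might drive the
// scaler, assuming it already has a decoded YV12 frame in src[0..2] with the given
// strides.  The helper name, the 16-line slice height and all buffer/stride variables
// below are made up for illustration; only SwScale_Init(), SwScale_YV12slice_brg24()
// and the s_xinc/s_yinc formulas come from this file.
#if 0
static void example_scale_frame(unsigned char *src[3], int srcstride[3],
                                int src_w, int src_h,
                                unsigned char *rgbdst, int dststride,
                                int dst_w, int dst_h, int dstbpp)
{
    // scaling increments as documented above: 8.8 fixed point horizontally,
    // 16.16 fixed point vertically
    unsigned int s_xinc = (src_w << 8) / dst_w;
    unsigned int s_yinc = (src_h << 16) / dst_h;
    int y;

    SwScale_Init(); // build clip_table and the yuvtab_* lookup tables (once at startup)

    // feed the image as consecutive slices, starting at y==0, the way a
    // slice-rendering decoder would call this
    for(y = 0; y < src_h; y += 16)
    {
        unsigned char *slice[3];
        slice[0] = src[0] + y     * srcstride[0]; // Y plane of this slice
        slice[1] = src[1] + (y/2) * srcstride[1]; // U plane (half vertical resolution)
        slice[2] = src[2] + (y/2) * srcstride[2]; // V plane
        SwScale_YV12slice_brg24(slice, srcstride, y, 16,
                                rgbdst, dststride, dst_w, dstbpp,
                                s_xinc, s_yinc);
    }
}
#endif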