// Software scaling and colorspace conversion routines for MPlayer

#include <inttypes.h>
#include <string.h> // memcpy()
#include "../config.h"

#undef HAVE_MMX2 // code is buggy
//#undef HAVE_MMX

#define RET 0xC3 // near return opcode

// temporary storage for 4 yuv lines:
// 16bit for now (mmx likes it more compact)
static uint16_t pix_buf_y[4][2048];
static uint16_t pix_buf_uv[2][2048*2];

// clipping helper table for C implementations:
static unsigned char clip_table[768];

// yuv->rgb conversion tables:
static int yuvtab_2568[256];
static int yuvtab_3343[256];
static int yuvtab_0c92[256];
static int yuvtab_1a1e[256];
static int yuvtab_40cf[256];

static uint64_t yCoeff=    0x2568256825682568LL;
static uint64_t ubCoeff=   0x3343334333433343LL;
static uint64_t vrCoeff=   0x40cf40cf40cf40cfLL;
static uint64_t ugCoeff=   0xE5E2E5E2E5E2E5E2LL;
static uint64_t vgCoeff=   0xF36EF36EF36EF36ELL;
static uint64_t w80=       0x0080008000800080LL;
static uint64_t w10=       0x0010001000100010LL;

static uint64_t bm00000111=0x0000000000FFFFFFLL;
static uint64_t bm11111000=0xFFFFFFFFFF000000LL;

static uint64_t b16Dither= 0x0004000400040004LL;
static uint64_t b16Dither1=0x0004000400040004LL;
static uint64_t b16Dither2=0x0602060206020602LL;
static uint64_t g16Dither= 0x0002000200020002LL;
static uint64_t g16Dither1=0x0002000200020002LL;
static uint64_t g16Dither2=0x0301030103010301LL;

static uint64_t b16Mask=   0x001F001F001F001FLL;
static uint64_t g16Mask=   0x07E007E007E007E0LL;
static uint64_t r16Mask=   0xF800F800F800F800LL;
static uint64_t temp0;

static uint8_t funnyYCode[10000];
static uint8_t funnyUVCode[10000];
// *** bilinear scaling and yuv->rgb conversion of yv12 slices:
// *** Note: it's called multiple times while decoding a frame, first time y==0
// *** Designed to upscale, but may work for downscale too.
// s_xinc = (src_width << 8) / dst_width
// s_yinc = (src_height << 16) / dst_height
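// Illustrative example (added, not from the original source): for a
// hypothetical 320x240 -> 640x480 upscale the caller would pass
//   s_xinc = (320<<8)/640  = 128    (i.e. 0.5 in 8.8 fixed point)
//   s_yinc = (240<<16)/480 = 32768  (i.e. 0.5 in 16.16 fixed point)
// so the source position advances by half a source pixel per output pixel/line.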
void SwScale_YV12slice_brg24(unsigned char* srcptr[], int stride[], int y, int h,
                             unsigned char* dstptr, int dststride, int dstw, int dstbpp,
                             unsigned int s_xinc, unsigned int s_yinc){

// scaling factors:
//static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height;
//static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width;

  unsigned int s_xinc2=s_xinc>>1;

  static int s_srcypos;
  static int s_ypos;
  static int s_last_ypos;
  static int static_dstw;

#ifdef HAVE_MMX2
  static int old_dstw= -1;
  static int old_s_xinc= -1;
#endif
  s_xinc&= -2; // clear the last bit, or UV and Y might end up shifted relative to each other

  if(y==0){
    s_srcypos=-2*s_yinc;
    s_ypos=-2;
    s_last_ypos=-2;

#ifdef HAVE_MMX2
    // can't downscale !!!
    if(old_s_xinc != s_xinc || old_dstw!=dstw)
    {
      uint8_t *fragment;
      int imm8OfPShufW1;
      int imm8OfPShufW2;
      int fragmentLength;
      int xpos, xx, xalpha, i;

      old_s_xinc= s_xinc;
      old_dstw= dstw;
      static_dstw= dstw;

      // create an optimized horizontal scaling routine at run time:
      // the code fragment between labels 0: and 9: below is copied once per
      // 4 output pixels and its pshufw immediates are patched afterwards
      // code fragment
      // fragmentLength=0;
      // printf("%d, %d\n", fragmentLength, imm8OfPShufW1);
      asm volatile(
        "jmp 9f \n\t"
        // Begin
        "0: \n\t"
        "movq (%%esi, %%ebx), %%mm0 \n\t" //FIXME Alignment
        "movq %%mm0, %%mm1 \n\t"
        "psrlq $8, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "pshufw $0xFF, %%mm1, %%mm1 \n\t"
        "1: \n\t"
        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
        "2: \n\t"
        "psubw %%mm1, %%mm0 \n\t"
        "psraw $1, %%mm0 \n\t"
        "pmullw %%mm2, %%mm0 \n\t"
        "psllw $7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%%edi, %%eax) \n\t"
        "paddb %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFF
        "addb %%ch, %%cl \n\t" // 2*xalpha += (4*s_xinc)&0xFF
        "adcl %%edx, %%ebx \n\t" // xx+= (4*s_xinc)>>8 + carry
        "addl $8, %%eax \n\t"
        // End
        "9: \n\t"
//      "int $3\n\t"
        "leal 0b, %0 \n\t"
        "leal 1b, %1 \n\t"
        "leal 2b, %2 \n\t"
        "decl %1 \n\t"
        "decl %2 \n\t"
        "subl %0, %1 \n\t"
        "subl %0, %2 \n\t"
        "leal 9b, %3 \n\t"
        "subl %0, %3 \n\t"
        :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
         "=r" (fragmentLength)
      );
      xpos= xx=xalpha= 0;
      //FIXME choose size and or xinc so that they fit exactly
      for(i=0; i<dstw/8; i++)
      {
        int xx=xpos>>8;

        if((i&3) == 0)
        {
          int a=0;
          int b=((xpos+s_xinc)>>8) - xx;
          int c=((xpos+s_xinc*2)>>8) - xx;
          int d=((xpos+s_xinc*3)>>8) - xx;

          memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength);

          funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]=
          funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
            a | (b<<2) | (c<<4) | (d<<6);

          funnyYCode[fragmentLength*(i+4)/4]= RET;
        }
        xpos+=s_xinc;
      }
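      // Worked example (added, not from the original source): with s_xinc=128
      // (a 2x upscale) the first patch point (i=0, xpos=0) gives
      //   a=0, b=(0+128)>>8 = 0, c=(0+256)>>8 = 1, d=(0+384)>>8 = 1,
      // so the patched pshufw immediate is 0|(0<<2)|(1<<4)|(1<<6) = 0x50,
      // i.e. the four output words are read from source words 0,0,1,1.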
      xpos= xx=xalpha= 0;
      //FIXME choose size and or xinc so that they fit exactly
      for(i=0; i<dstw/8; i++)
      {
        int xx=xpos>>8;

        if((i&3) == 0)
        {
          int a=0;
          int b=((xpos+s_xinc2)>>8) - xx;
          int c=((xpos+s_xinc2*2)>>8) - xx;
          int d=((xpos+s_xinc2*3)>>8) - xx;

          memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);

          funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
          funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
            a | (b<<2) | (c<<4) | (d<<6);

          funnyUVCode[fragmentLength*(i+4)/4]= RET;
        }
        xpos+=s_xinc2;
      }
//    funnyCode[0]= RET;
    }
#endif
  } // if(y==0): reset the counters (and rebuild the scaler code if needed)

  while(1){
    unsigned char *dest=dstptr+dststride*s_ypos;
    int y0=2+(s_srcypos>>16);
    int y1=1+(s_srcypos>>17);
    int yalpha=(s_srcypos&0xFFFF)>>7;
    int yalpha1=yalpha^511;
    int uvalpha=((s_srcypos>>1)&0xFFFF)>>7;
    int uvalpha1=uvalpha^511;
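    // Note (added for clarity): yalpha/uvalpha are 9-bit vertical blend
    // weights (0..511); because they fit in 9 bits, x^511 == 511-x, so
    // yalpha1/uvalpha1 are simply the complementary weights applied to the
    // buf0/uvbuf0 lines below.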
    uint16_t *buf0=pix_buf_y[y0&3];
    uint16_t *buf1=pix_buf_y[((y0+1)&3)];
    uint16_t *uvbuf0=pix_buf_uv[y1&1];
    uint16_t *uvbuf1=pix_buf_uv[(y1&1)^1];
    int i;

    if(y0>=y+h) break;

    s_ypos++; s_srcypos+=s_yinc;

    if(s_last_ypos!=y0){
      unsigned char *src=srcptr[0]+(y0-y)*stride[0];
      unsigned int xpos=0;
      s_last_ypos=y0;

      // *** horizontal scale Y line to temp buffer
      // this loop should be rewritten in MMX assembly!!!!
#ifdef HAVE_MMX2
      asm volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
        "movd %5, %%mm6 \n\t" // s_xinc&0xFF
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "movq %%mm6, %%mm2 \n\t"
        "psllq $16, %%mm2 \n\t"
        "paddb %%mm6, %%mm2 \n\t"
        "psllq $16, %%mm2 \n\t"
        "paddb %%mm6, %%mm2 \n\t"
        "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF
        "movq %%mm2, temp0 \n\t"
        "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "xorl %%eax, %%eax \n\t" // i
        "xorl %%ebx, %%ebx \n\t" // xx
        "movl %0, %%esi \n\t" // src
        "movl %1, %%edi \n\t" // buf1
        "movl %3, %%edx \n\t" // (s_xinc*4)>>8
        "xorl %%ecx, %%ecx \n\t"
        "movb %4, %%ch \n\t" // (s_xinc*4)&0xFF
//      "int $3\n\t"
        "call funnyYCode \n\t"
        "movq temp0, %%mm2 \n\t"
        "xorb %%cl, %%cl \n\t"
        "call funnyYCode \n\t"
        "movq temp0, %%mm2 \n\t"
        "xorb %%cl, %%cl \n\t"
        "call funnyYCode \n\t"
        "movq temp0, %%mm2 \n\t"
        "xorb %%cl, %%cl \n\t"
        "call funnyYCode \n\t"
        "movq temp0, %%mm2 \n\t"
        "xorb %%cl, %%cl \n\t"
        "call funnyYCode \n\t"
        "movq temp0, %%mm2 \n\t"
        "xorb %%cl, %%cl \n\t"
        "call funnyYCode \n\t"
        "movq temp0, %%mm2 \n\t"
        "xorb %%cl, %%cl \n\t"
        "call funnyYCode \n\t"
        "movq temp0, %%mm2 \n\t"
        "xorb %%cl, %%cl \n\t"
        "call funnyYCode \n\t"
        :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>8),
           "m" ((s_xinc*4)&0xFF), "m" (s_xinc&0xFF)
        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
      );
#elif defined (ARCH_X86)
      //NO MMX just normal asm ... FIXME try/write funny MMX2 variant
      //FIXME add prefetch
      asm volatile(
        "xorl %%eax, %%eax \n\t" // i
        "xorl %%ebx, %%ebx \n\t" // xx
        "xorl %%ecx, %%ecx \n\t" // 2*xalpha
        "1: \n\t"
        "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
        "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $8, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "movl %1, %%edi \n\t"
        "shrl $1, %%esi \n\t"
        "movw %%si, (%%edi, %%eax, 2) \n\t"
        "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF
        "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry
        "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
        "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $8, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "movl %1, %%edi \n\t"
        "shrl $1, %%esi \n\t"
        "movw %%si, 2(%%edi, %%eax, 2) \n\t"
        "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF
        "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry
        "addl $2, %%eax \n\t"
        "cmpl %2, %%eax \n\t"
        " jb 1b \n\t"
        :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>8), "m" (s_xinc&0xFF)
        : "%eax", "%ebx", "%ecx", "%edi", "%esi"
      );
#else
      // C fallback: bilinear horizontal scaling in 8.8 fixed point;
      // xalpha is 0..127 and xalpha^127 == 127-xalpha is the complementary weight
      for(i=0;i<dstw;i++){
        register unsigned int xx=xpos>>8;
        register unsigned int xalpha=(xpos&0xFF)>>1;
        buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha);
        xpos+=s_xinc;
      }
#endif
      // *** horizontal scale U and V lines to temp buffer
      if(!(y0&1)){
        unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1];
        unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2];
        xpos=0;

        // this loop should be rewritten in MMX assembly!!!!
#ifdef HAVE_MMX2
        asm volatile(
          "pxor %%mm7, %%mm7 \n\t"
          "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
          "movd %5, %%mm6 \n\t" // s_xinc&0xFF
          "punpcklwd %%mm6, %%mm6 \n\t"
          "punpcklwd %%mm6, %%mm6 \n\t"
          "movq %%mm6, %%mm2 \n\t"
          "psllq $16, %%mm2 \n\t"
          "paddb %%mm6, %%mm2 \n\t"
          "psllq $16, %%mm2 \n\t"
          "paddb %%mm6, %%mm2 \n\t"
          "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF
          "movq %%mm2, temp0 \n\t"
          "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF
          "punpcklwd %%mm6, %%mm6 \n\t"
          "punpcklwd %%mm6, %%mm6 \n\t"
          "xorl %%eax, %%eax \n\t" // i
          "xorl %%ebx, %%ebx \n\t" // xx
          "movl %0, %%esi \n\t" // src
          "movl %1, %%edi \n\t" // buf1
          "movl %3, %%edx \n\t" // (s_xinc*4)>>8
          "xorl %%ecx, %%ecx \n\t"
          "movb %4, %%ch \n\t" // (s_xinc*4)&0xFF
//        "int $3\n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "xorl %%eax, %%eax \n\t" // i
          "xorl %%ebx, %%ebx \n\t" // xx
          "movl %6, %%esi \n\t" // src
          "movl %1, %%edi \n\t" // buf1
          "addl $4096, %%edi \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          "movq temp0, %%mm2 \n\t"
          "xorb %%cl, %%cl \n\t"
          "call funnyUVCode \n\t"
          :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>8),
             "m" ((s_xinc2*4)&0xFF), "m" (s_xinc2&0xFF), "m" (src2)
          : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
        );
#elif defined (ARCH_X86)
        asm volatile(
          "xorl %%eax, %%eax \n\t" // i
          "xorl %%ebx, %%ebx \n\t" // xx
          "xorl %%ecx, %%ecx \n\t" // 2*xalpha
          "1: \n\t"
          "movl %0, %%esi \n\t"
          "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
          "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
          "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
          "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
          "shll $8, %%edi \n\t"
          "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
          "movl %1, %%edi \n\t"
          "shrl $1, %%esi \n\t"
          "movw %%si, (%%edi, %%eax, 2) \n\t"
          "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
          "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
          "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
          "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
          "shll $8, %%edi \n\t"
          "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
          "movl %1, %%edi \n\t"
          "shrl $1, %%esi \n\t"
          "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
          "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF
          "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry
          "addl $1, %%eax \n\t"
          "cmpl %2, %%eax \n\t"
          " jb 1b \n\t"
          :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>8), "m" (s_xinc2&0xFF),
             "r" (src2)
          : "%eax", "%ebx", "%ecx", "%edi", "%esi"
        );
#else
        for(i=0;i<dstw;i++){
          register unsigned int xx=xpos>>8;
          register unsigned int xalpha=(xpos&0xFF)>>1;
          uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
          uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
          xpos+=s_xinc2;
        }
#endif
      }

      if(!y0) continue;
    }
    // Note1: this code can be restricted to n*8 (or n*16) width lines to simplify optimization...
    // Re: Note1: ok n*4 for now
    // Note2: instead of using lookup tabs, mmx version could do the multiply...
    // Re: Note2: yep
    // Note3: maybe we should make separate 15/16, 24 and 32bpp versions of this:
    // Re: done (32 & 16) and 16 has dithering :) but 16 is untested
#ifdef HAVE_MMX
    //FIXME write lq version with less uv ...
    //FIXME reorder / optimize
    if(dstbpp == 32)
    {
      asm volatile(
#define YSCALEYUV2RGB \
        "pxor %%mm7, %%mm7 \n\t"\
        "movd %6, %%mm6 \n\t" /*yalpha1*/\
        "punpcklwd %%mm6, %%mm6 \n\t"\
        "punpcklwd %%mm6, %%mm6 \n\t"\
        "movd %7, %%mm5 \n\t" /*uvalpha1*/\
        "punpcklwd %%mm5, %%mm5 \n\t"\
        "punpcklwd %%mm5, %%mm5 \n\t"\
        "xorl %%eax, %%eax \n\t"\
        "1: \n\t"\
        "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
        "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
        "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
        "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
        "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
        "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
        "psubw w10, %%mm1 \n\t" /* Y-16*/\
        "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\
        "pmulhw yCoeff, %%mm1 \n\t"\
\
        "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
        "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
        "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
        "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
        "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
        "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
        "psubw w80, %%mm3 \n\t" /* (U-128)*/\
        "psllw $3, %%mm3 \n\t" /*(U-128)8*/\
\
        "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
        "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
        "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
        "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
        "psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
        "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
        "psubw w80, %%mm0 \n\t" /* (V-128)*/\
        "psllw $3, %%mm0 \n\t" /* (V-128)8*/\
\
        "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
        "pmulhw ubCoeff, %%mm3 \n\t"\
        "paddw %%mm1, %%mm3 \n\t" /* B*/\
\
        "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
        "pmulhw vrCoeff, %%mm0 \n\t"\
        "paddw %%mm1, %%mm0 \n\t" /* R*/\
\
        "pmulhw ugCoeff, %%mm2 \n\t"\
        "pmulhw vgCoeff, %%mm4 \n\t"\
        "paddw %%mm4, %%mm2 \n\t"\
        "paddw %%mm2, %%mm1 \n\t" /* G*/\
\
        "packuswb %%mm3, %%mm3 \n\t"\
        "packuswb %%mm0, %%mm0 \n\t"\
        "packuswb %%mm1, %%mm1 \n\t"
        YSCALEYUV2RGB
        "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
        "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
        "movq %%mm3, %%mm1 \n\t"
        "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
        "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
#ifdef HAVE_MMX2
        "movntq %%mm3, (%4, %%eax, 4) \n\t"
        "movntq %%mm1, 8(%4, %%eax, 4) \n\t"
#else
        "movq %%mm3, (%4, %%eax, 4) \n\t"
        "movq %%mm1, 8(%4, %%eax, 4) \n\t"
#endif
        "addl $4, %%eax \n\t"
        "cmpl %5, %%eax \n\t"
        " jb 1b \n\t"
        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
           "m" (yalpha1), "m" (uvalpha1)
        : "%eax"
      );
    }
    else if(dstbpp==24)
    {
      asm volatile(
        YSCALEYUV2RGB
        // lsb ... msb
        "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
        "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
        "movq %%mm3, %%mm1 \n\t"
        "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
        "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

        "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
        "psrlq $8, %%mm3 \n\t" // GR0BGR00
        "pand bm00000111, %%mm2 \n\t" // BGR00000
        "pand bm11111000, %%mm3 \n\t" // 000BGR00
        "por %%mm2, %%mm3 \n\t" // BGRBGR00
        "movq %%mm1, %%mm2 \n\t"
        "psllq $48, %%mm1 \n\t" // 000000BG
        "por %%mm1, %%mm3 \n\t" // BGRBGRBG

        "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
        "psrld $16, %%mm2 \n\t" // R000R000
        "psrlq $24, %%mm1 \n\t" // 0BGR0000
        "por %%mm2, %%mm1 \n\t" // RBGRR000
        "movl %4, %%ebx \n\t"
        "addl %%eax, %%ebx \n\t"
#ifdef HAVE_MMX2
        //FIXME Alignment
        "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
        "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
        "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
        "psrlq $32, %%mm3 \n\t"
        "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
        "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
#endif
        "addl $4, %%eax \n\t"
        "cmpl %5, %%eax \n\t"
        " jb 1b \n\t"
        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
           "m" (yalpha1), "m" (uvalpha1)
        : "%eax", "%ebx"
      );
    }
    else if(dstbpp==16)
    {
      asm volatile(
        YSCALEYUV2RGB
        "paddusb g16Dither, %%mm1 \n\t"
        "paddusb b16Dither, %%mm0 \n\t"
        "paddusb b16Dither, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
        "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
        "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
        "psrlw $3, %%mm3 \n\t"
        "psllw $3, %%mm1 \n\t"
        "psllw $8, %%mm0 \n\t"
        "pand g16Mask, %%mm1 \n\t"
        "pand r16Mask, %%mm0 \n\t"
        "por %%mm3, %%mm1 \n\t"
        "por %%mm1, %%mm0 \n\t"
#ifdef HAVE_MMX2
        "movntq %%mm0, (%4, %%eax, 2) \n\t"
#else
        "movq %%mm0, (%4, %%eax, 2) \n\t"
#endif
        "addl $4, %%eax \n\t"
        "cmpl %5, %%eax \n\t"
        " jb 1b \n\t"
        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
           "m" (yalpha1), "m" (uvalpha1)
        : "%eax"
      );
    }
#else
    if(dstbpp==32 || dstbpp==24)
    {
      for(i=0;i<dstw;i++){
        // vertical linear interpolation && yuv2rgb in a single step:
        int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
        int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
        int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);

        dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
        dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
        dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
        dest+=dstbpp>>3;
      }
    }
    else if(dstbpp==16)
    {
      for(i=0;i<dstw;i++){
        // vertical linear interpolation && yuv2rgb in a single step:
        int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
        int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
        int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);

        ((uint16_t*)dest)[0] =
            (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
            ((clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0) |
            ((clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800);
        dest+=2;
      }
    }
    else if(dstbpp==15) // 15bit FIXME: how do I figure out whether it's 15 or 16 bpp?
    {
      for(i=0;i<dstw;i++){
        // vertical linear interpolation && yuv2rgb in a single step:
        int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
        int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
        int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);

        ((uint16_t*)dest)[0] =
            (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
            ((clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0) |
            ((clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00);
        dest+=2;
      }
    }
#endif
    // swap the per-line dither constants (alternate between the two patterns)
    b16Dither= b16Dither1;
    b16Dither1= b16Dither2;
    b16Dither2= b16Dither;

    g16Dither= g16Dither1;
    g16Dither1= g16Dither2;
    g16Dither2= g16Dither;
  }

#ifdef HAVE_3DNOW
  asm volatile("femms");
#elif defined (HAVE_MMX)
  asm volatile("emms");
#endif
}
void SwScale_Init(){
  // generating tables:
  int i;
  for(i=0;i<256;i++){
    // clip_table maps index (value+256) to the saturated 0..255 value
    clip_table[i]=0;
    clip_table[i+256]=i;
    clip_table[i+512]=255;

    // fixed point (scaled by 8192 = 1<<13) yuv->rgb coefficient tables;
    // the 256<<13 bias in the Y table centers the result on the clip_table
    yuvtab_2568[i]=(0x2568*(i-16))+(256<<13);
    yuvtab_3343[i]=0x3343*(i-128);
    yuvtab_0c92[i]=-0x0c92*(i-128);
    yuvtab_1a1e[i]=-0x1a1e*(i-128);
    yuvtab_40cf[i]=0x40cf*(i-128);
  }
}
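
/*
 * Illustrative usage sketch (added; NOT part of the original file).  All
 * names below (src_w, src_h, dst_w, dst_h, the plane pointers and strides)
 * are hypothetical -- a real caller (e.g. a video output driver) would get
 * them from the decoder/vo setup.  It feeds one whole YV12 frame as a single
 * slice and asks for 24bpp BGR output.
 */
#if 0
static void example_scale_frame(unsigned char *y_plane, unsigned char *u_plane,
                                unsigned char *v_plane, int y_stride, int uv_stride,
                                unsigned char *rgb_dst, int rgb_stride,
                                int src_w, int src_h, int dst_w, int dst_h)
{
  unsigned char *src[3] = { y_plane, u_plane, v_plane };
  int stride[3]         = { y_stride, uv_stride, uv_stride };
  unsigned int xinc = (src_w << 8)  / dst_w;  /* 8.8  fixed point step  */
  unsigned int yinc = (src_h << 16) / dst_h;  /* 16.16 fixed point step */

  SwScale_Init(); /* build the lookup tables (normally done once at startup) */

  /* whole frame passed as one slice: y=0, h=src_h */
  SwScale_YV12slice_brg24(src, stride, 0, src_h,
                          rgb_dst, rgb_stride, dst_w, 24, xinc, yinc);
}
#endif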