  1. // Software scaling and colorspace conversion routines for MPlayer
  2. // Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
  3. // current version mostly by Michael Niedermayer (michaelni@gmx.at)
  4. // the parts written by michael are under GNU GPL
  5. #include <inttypes.h>
  6. #include <string.h>
  7. #include "../config.h"
  8. #include "swscale.h"
  9. #include "../mmx_defs.h"
  10. #undef MOVNTQ
  11. //#undef HAVE_MMX2
  12. //#undef HAVE_MMX
  13. //#undef ARCH_X86
  14. #define DITHER1XBPP
  15. int fullUVIpol=0;
  16. //disables the unscaled height version
  17. int allwaysIpol=0;
  18. #define RET 0xC3 //near return opcode
  19. /*
  20. NOTES
  21. known BUGS with known cause (no bug reports please, but patches are welcome :) )
  22. horizontal MMX2 scaler reads 1-7 samples too many (might cause a sig11)
  23. Supported output formats: BGR15, BGR16, BGR24, BGR32
  24. BGR15 & BGR16 MMX versions support dithering
  25. Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  26. TODO
  27. more intelligent misalignment avoidance for the horizontal scaler
  28. bicubic scaler
  29. dither in C
  30. change the distance of the u & v buffer
  31. */
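// Regarding the over-read noted above: the MMX2 horizontal scaler always loads
// a full 8-byte group at the end of a line. One possible caller-side guard,
// shown here only as a hedged sketch (the helper name and the 8-byte pad size
// are assumptions, not part of this file), is to scale from a padded copy of
// each source line:
//
//   static uint8_t padded_line[2048 + 8];             /* hypothetical buffer */
//   static const uint8_t *pad_src_line(const uint8_t *src, int w)
//   {
//       memcpy(padded_line, src, w);                  /* real samples        */
//       memset(padded_line + w, src[w - 1], 8);       /* replicate last one  */
//       return padded_line;
//   }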
  32. #define ABS(a) ((a) > 0 ? (a) : (-(a)))
  33. #define MIN(a,b) ((a) > (b) ? (b) : (a))
  34. #define MAX(a,b) ((a) < (b) ? (b) : (a))
  35. #ifdef HAVE_MMX2
  36. #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
  37. #elif defined (HAVE_3DNOW)
  38. #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
  39. #endif
  40. #ifdef HAVE_MMX2
  41. #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
  42. #else
  43. #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
  44. #endif
  45. #ifdef HAVE_MMX
  46. static uint64_t __attribute__((aligned(8))) yCoeff= 0x2568256825682568LL;
  47. static uint64_t __attribute__((aligned(8))) vrCoeff= 0x3343334333433343LL;
  48. static uint64_t __attribute__((aligned(8))) ubCoeff= 0x40cf40cf40cf40cfLL;
  49. static uint64_t __attribute__((aligned(8))) vgCoeff= 0xE5E2E5E2E5E2E5E2LL;
  50. static uint64_t __attribute__((aligned(8))) ugCoeff= 0xF36EF36EF36EF36ELL;
  51. static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL;
  52. static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL;
  53. static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL;
  54. static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
  55. static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
  56. static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
  57. static uint64_t __attribute__((aligned(8))) b16Dither= 0x0004000400040004LL;
  58. static uint64_t __attribute__((aligned(8))) b16Dither1=0x0004000400040004LL;
  59. static uint64_t __attribute__((aligned(8))) b16Dither2=0x0602060206020602LL;
  60. static uint64_t __attribute__((aligned(8))) g16Dither= 0x0002000200020002LL;
  61. static uint64_t __attribute__((aligned(8))) g16Dither1=0x0002000200020002LL;
  62. static uint64_t __attribute__((aligned(8))) g16Dither2=0x0301030103010301LL;
  63. static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL;
  64. static uint64_t __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL;
  65. static uint64_t __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL;
  66. static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL;
  67. static uint64_t __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL;
  68. static uint64_t __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL;
  69. static uint64_t __attribute__((aligned(8))) temp0;
  70. static uint64_t __attribute__((aligned(8))) asm_yalpha1;
  71. static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
  72. #endif
  73. // temporary storage for 4 yuv lines:
  74. // 16bit for now (mmx likes it more compact)
  75. #ifdef HAVE_MMX
  76. static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048];
  77. static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2];
  78. #else
  79. static uint16_t pix_buf_y[4][2048];
  80. static uint16_t pix_buf_uv[2][2048*2];
  81. #endif
  82. // clipping helper table for C implementations:
  83. static unsigned char clip_table[768];
  84. static unsigned short clip_table16b[768];
  85. static unsigned short clip_table16g[768];
  86. static unsigned short clip_table16r[768];
  87. static unsigned short clip_table15b[768];
  88. static unsigned short clip_table15g[768];
  89. static unsigned short clip_table15r[768];
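// The 768-entry layout implements branchless saturation: SwScale_Init() at the
// end of this file fills indices [0,255] with 0, [256,511] with i-256 and
// [512,767] with 255, and the +(256<<13) bias folded into yuvtab_2568 shifts
// every >>13 lookup into that window. In plain C (sketch for illustration only,
// clip_uint8 is a hypothetical helper, not part of this file):
//
//   static inline int clip_uint8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
//   /* clip_table[x] behaves like clip_uint8(x - 256) for 0 <= x < 768 */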
  90. // yuv->rgb conversion tables:
  91. static int yuvtab_2568[256];
  92. static int yuvtab_3343[256];
  93. static int yuvtab_0c92[256];
  94. static int yuvtab_1a1e[256];
  95. static int yuvtab_40cf[256];
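// The hex table names are their 3.13 fixed-point factors (filled in
// SwScale_Init() below): yuvtab_2568[y] ~= 1.17*(y-16), yuvtab_3343[v] ~=
// 1.60*(v-128), yuvtab_0c92[u] ~= -0.39*(u-128), yuvtab_1a1e[v] ~= -0.82*(v-128)
// and yuvtab_40cf[u] ~= 2.03*(u-128), each scaled by 2^13 -- roughly the BT.601
// YCbCr->RGB coefficients. One red sample in the C paths below is computed as
//
//   /* R = clip_table[ (yuvtab_2568[Y8] + yuvtab_3343[V8]) >> 13 ]; */
//
// where Y8/V8 stand for the 8-bit input samples (names used only in this comment).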
  96. static uint8_t funnyYCode[10000];
  97. static uint8_t funnyUVCode[10000];
  98. static int canMMX2BeUsed=0;
  99. #define FULL_YSCALEYUV2RGB \
  100. "pxor %%mm7, %%mm7 \n\t"\
  101. "movd %6, %%mm6 \n\t" /*yalpha1*/\
  102. "punpcklwd %%mm6, %%mm6 \n\t"\
  103. "punpcklwd %%mm6, %%mm6 \n\t"\
  104. "movd %7, %%mm5 \n\t" /*uvalpha1*/\
  105. "punpcklwd %%mm5, %%mm5 \n\t"\
  106. "punpcklwd %%mm5, %%mm5 \n\t"\
  107. "xorl %%eax, %%eax \n\t"\
  108. "1: \n\t"\
  109. "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
  110. "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
  111. "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
  112. "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
  113. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  114. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  115. "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  116. "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  117. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  118. "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  119. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  120. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  121. "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
  122. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  123. "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  124. "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
  125. "psubw w400, %%mm3 \n\t" /* 8(U-128)*/\
  126. "pmulhw yCoeff, %%mm1 \n\t"\
  127. \
  128. \
  129. "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  130. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  131. "pmulhw ubCoeff, %%mm3 \n\t"\
  132. "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  133. "pmulhw ugCoeff, %%mm2 \n\t"\
  134. "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  135. "psubw w400, %%mm0 \n\t" /* (V-128)8*/\
  136. \
  137. \
  138. "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
  139. "pmulhw vrCoeff, %%mm0 \n\t"\
  140. "pmulhw vgCoeff, %%mm4 \n\t"\
  141. "paddw %%mm1, %%mm3 \n\t" /* B*/\
  142. "paddw %%mm1, %%mm0 \n\t" /* R*/\
  143. "packuswb %%mm3, %%mm3 \n\t"\
  144. \
  145. "packuswb %%mm0, %%mm0 \n\t"\
  146. "paddw %%mm4, %%mm2 \n\t"\
  147. "paddw %%mm2, %%mm1 \n\t" /* G*/\
  148. \
  149. "packuswb %%mm1, %%mm1 \n\t"
  150. #define YSCALEYUV2RGB \
  151. "movd %6, %%mm6 \n\t" /*yalpha1*/\
  152. "punpcklwd %%mm6, %%mm6 \n\t"\
  153. "punpcklwd %%mm6, %%mm6 \n\t"\
  154. "movq %%mm6, asm_yalpha1 \n\t"\
  155. "movd %7, %%mm5 \n\t" /*uvalpha1*/\
  156. "punpcklwd %%mm5, %%mm5 \n\t"\
  157. "punpcklwd %%mm5, %%mm5 \n\t"\
  158. "movq %%mm5, asm_uvalpha1 \n\t"\
  159. "xorl %%eax, %%eax \n\t"\
  160. "1: \n\t"\
  161. "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
  162. "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
  163. "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  164. "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  165. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  166. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  167. "movq asm_uvalpha1, %%mm0 \n\t"\
  168. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  169. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  170. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  171. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  172. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  173. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  174. "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
  175. "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
  176. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  177. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  178. "pmulhw ugCoeff, %%mm3 \n\t"\
  179. "pmulhw vgCoeff, %%mm4 \n\t"\
  180. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  181. "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
  182. "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
  183. "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
  184. "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
  185. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  186. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  187. "pmulhw asm_yalpha1, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  188. "pmulhw asm_yalpha1, %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  189. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  190. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  191. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  192. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  193. "pmulhw ubCoeff, %%mm2 \n\t"\
  194. "pmulhw vrCoeff, %%mm5 \n\t"\
  195. "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
  196. "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
  197. "pmulhw yCoeff, %%mm1 \n\t"\
  198. "pmulhw yCoeff, %%mm7 \n\t"\
  199. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  200. "paddw %%mm3, %%mm4 \n\t"\
  201. "movq %%mm2, %%mm0 \n\t"\
  202. "movq %%mm5, %%mm6 \n\t"\
  203. "movq %%mm4, %%mm3 \n\t"\
  204. "punpcklwd %%mm2, %%mm2 \n\t"\
  205. "punpcklwd %%mm5, %%mm5 \n\t"\
  206. "punpcklwd %%mm4, %%mm4 \n\t"\
  207. "paddw %%mm1, %%mm2 \n\t"\
  208. "paddw %%mm1, %%mm5 \n\t"\
  209. "paddw %%mm1, %%mm4 \n\t"\
  210. "punpckhwd %%mm0, %%mm0 \n\t"\
  211. "punpckhwd %%mm6, %%mm6 \n\t"\
  212. "punpckhwd %%mm3, %%mm3 \n\t"\
  213. "paddw %%mm7, %%mm0 \n\t"\
  214. "paddw %%mm7, %%mm6 \n\t"\
  215. "paddw %%mm7, %%mm3 \n\t"\
  216. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  217. "packuswb %%mm0, %%mm2 \n\t"\
  218. "packuswb %%mm6, %%mm5 \n\t"\
  219. "packuswb %%mm3, %%mm4 \n\t"\
  220. "pxor %%mm7, %%mm7 \n\t"
  221. #define YSCALEYUV2RGB1 \
  222. "xorl %%eax, %%eax \n\t"\
  223. "1: \n\t"\
  224. "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
  225. "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  226. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  227. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  228. "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
  229. "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
  230. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  231. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  232. "pmulhw ugCoeff, %%mm3 \n\t"\
  233. "pmulhw vgCoeff, %%mm4 \n\t"\
  234. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  235. "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
  236. "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
  237. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  238. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  239. "pmulhw ubCoeff, %%mm2 \n\t"\
  240. "pmulhw vrCoeff, %%mm5 \n\t"\
  241. "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
  242. "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
  243. "pmulhw yCoeff, %%mm1 \n\t"\
  244. "pmulhw yCoeff, %%mm7 \n\t"\
  245. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  246. "paddw %%mm3, %%mm4 \n\t"\
  247. "movq %%mm2, %%mm0 \n\t"\
  248. "movq %%mm5, %%mm6 \n\t"\
  249. "movq %%mm4, %%mm3 \n\t"\
  250. "punpcklwd %%mm2, %%mm2 \n\t"\
  251. "punpcklwd %%mm5, %%mm5 \n\t"\
  252. "punpcklwd %%mm4, %%mm4 \n\t"\
  253. "paddw %%mm1, %%mm2 \n\t"\
  254. "paddw %%mm1, %%mm5 \n\t"\
  255. "paddw %%mm1, %%mm4 \n\t"\
  256. "punpckhwd %%mm0, %%mm0 \n\t"\
  257. "punpckhwd %%mm6, %%mm6 \n\t"\
  258. "punpckhwd %%mm3, %%mm3 \n\t"\
  259. "paddw %%mm7, %%mm0 \n\t"\
  260. "paddw %%mm7, %%mm6 \n\t"\
  261. "paddw %%mm7, %%mm3 \n\t"\
  262. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  263. "packuswb %%mm0, %%mm2 \n\t"\
  264. "packuswb %%mm6, %%mm5 \n\t"\
  265. "packuswb %%mm3, %%mm4 \n\t"\
  266. "pxor %%mm7, %%mm7 \n\t"
  267. // do vertical chrominance interpolation
  268. #define YSCALEYUV2RGB1b \
  269. "xorl %%eax, %%eax \n\t"\
  270. "1: \n\t"\
  271. "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
  272. "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
  273. "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  274. "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  275. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  276. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  277. "psrlw $5, %%mm3 \n\t"\
  278. "psrlw $5, %%mm4 \n\t"\
  279. "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
  280. "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
  281. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  282. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  283. "pmulhw ugCoeff, %%mm3 \n\t"\
  284. "pmulhw vgCoeff, %%mm4 \n\t"\
  285. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  286. "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
  287. "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
  288. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  289. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  290. "pmulhw ubCoeff, %%mm2 \n\t"\
  291. "pmulhw vrCoeff, %%mm5 \n\t"\
  292. "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
  293. "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
  294. "pmulhw yCoeff, %%mm1 \n\t"\
  295. "pmulhw yCoeff, %%mm7 \n\t"\
  296. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  297. "paddw %%mm3, %%mm4 \n\t"\
  298. "movq %%mm2, %%mm0 \n\t"\
  299. "movq %%mm5, %%mm6 \n\t"\
  300. "movq %%mm4, %%mm3 \n\t"\
  301. "punpcklwd %%mm2, %%mm2 \n\t"\
  302. "punpcklwd %%mm5, %%mm5 \n\t"\
  303. "punpcklwd %%mm4, %%mm4 \n\t"\
  304. "paddw %%mm1, %%mm2 \n\t"\
  305. "paddw %%mm1, %%mm5 \n\t"\
  306. "paddw %%mm1, %%mm4 \n\t"\
  307. "punpckhwd %%mm0, %%mm0 \n\t"\
  308. "punpckhwd %%mm6, %%mm6 \n\t"\
  309. "punpckhwd %%mm3, %%mm3 \n\t"\
  310. "paddw %%mm7, %%mm0 \n\t"\
  311. "paddw %%mm7, %%mm6 \n\t"\
  312. "paddw %%mm7, %%mm3 \n\t"\
  313. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  314. "packuswb %%mm0, %%mm2 \n\t"\
  315. "packuswb %%mm6, %%mm5 \n\t"\
  316. "packuswb %%mm3, %%mm4 \n\t"\
  317. "pxor %%mm7, %%mm7 \n\t"
  318. #define WRITEBGR32 \
  319. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  320. "movq %%mm2, %%mm1 \n\t" /* B */\
  321. "movq %%mm5, %%mm6 \n\t" /* R */\
  322. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  323. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  324. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  325. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  326. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  327. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  328. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  329. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  330. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  331. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  332. \
  333. MOVNTQ(%%mm0, (%4, %%eax, 4))\
  334. MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
  335. MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
  336. MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
  337. \
  338. "addl $8, %%eax \n\t"\
  339. "cmpl %5, %%eax \n\t"\
  340. " jb 1b \n\t"
  341. #define WRITEBGR16 \
  342. "movq %%mm2, %%mm1 \n\t" /* B */\
  343. "movq %%mm4, %%mm3 \n\t" /* G */\
  344. "movq %%mm5, %%mm6 \n\t" /* R */\
  345. \
  346. "punpcklbw %%mm7, %%mm3 \n\t" /* 0G0G0G0G */\
  347. "punpcklbw %%mm7, %%mm2 \n\t" /* 0B0B0B0B */\
  348. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R */\
  349. \
  350. "psrlw $3, %%mm2 \n\t"\
  351. "psllw $3, %%mm3 \n\t"\
  352. "psllw $8, %%mm5 \n\t"\
  353. \
  354. "pand g16Mask, %%mm3 \n\t"\
  355. "pand r16Mask, %%mm5 \n\t"\
  356. \
  357. "por %%mm3, %%mm2 \n\t"\
  358. "por %%mm5, %%mm2 \n\t"\
  359. \
  360. "punpckhbw %%mm7, %%mm4 \n\t" /* 0G0G0G0G */\
  361. "punpckhbw %%mm7, %%mm1 \n\t" /* 0B0B0B0B */\
  362. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R */\
  363. \
  364. "psrlw $3, %%mm1 \n\t"\
  365. "psllw $3, %%mm4 \n\t"\
  366. "psllw $8, %%mm6 \n\t"\
  367. \
  368. "pand g16Mask, %%mm4 \n\t"\
  369. "pand r16Mask, %%mm6 \n\t"\
  370. \
  371. "por %%mm4, %%mm1 \n\t"\
  372. "por %%mm6, %%mm1 \n\t"\
  373. \
  374. MOVNTQ(%%mm2, (%4, %%eax, 2))\
  375. MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
  376. \
  377. "addl $8, %%eax \n\t"\
  378. "cmpl %5, %%eax \n\t"\
  379. " jb 1b \n\t"
  380. #define WRITEBGR15 \
  381. "movq %%mm2, %%mm1 \n\t" /* B */\
  382. "movq %%mm4, %%mm3 \n\t" /* G */\
  383. "movq %%mm5, %%mm6 \n\t" /* R */\
  384. \
  385. "punpcklbw %%mm7, %%mm3 \n\t" /* 0G0G0G0G */\
  386. "punpcklbw %%mm7, %%mm2 \n\t" /* 0B0B0B0B */\
  387. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R */\
  388. \
  389. "psrlw $3, %%mm2 \n\t"\
  390. "psllw $2, %%mm3 \n\t"\
  391. "psllw $7, %%mm5 \n\t"\
  392. \
  393. "pand g15Mask, %%mm3 \n\t"\
  394. "pand r15Mask, %%mm5 \n\t"\
  395. \
  396. "por %%mm3, %%mm2 \n\t"\
  397. "por %%mm5, %%mm2 \n\t"\
  398. \
  399. "punpckhbw %%mm7, %%mm4 \n\t" /* 0G0G0G0G */\
  400. "punpckhbw %%mm7, %%mm1 \n\t" /* 0B0B0B0B */\
  401. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R */\
  402. \
  403. "psrlw $3, %%mm1 \n\t"\
  404. "psllw $2, %%mm4 \n\t"\
  405. "psllw $7, %%mm6 \n\t"\
  406. \
  407. "pand g15Mask, %%mm4 \n\t"\
  408. "pand r15Mask, %%mm6 \n\t"\
  409. \
  410. "por %%mm4, %%mm1 \n\t"\
  411. "por %%mm6, %%mm1 \n\t"\
  412. \
  413. MOVNTQ(%%mm2, (%4, %%eax, 2))\
  414. MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
  415. \
  416. "addl $8, %%eax \n\t"\
  417. "cmpl %5, %%eax \n\t"\
  418. " jb 1b \n\t"
  419. // FIXME find a faster way to shuffle it to BGR24
  420. #define WRITEBGR24 \
  421. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  422. "movq %%mm2, %%mm1 \n\t" /* B */\
  423. "movq %%mm5, %%mm6 \n\t" /* R */\
  424. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  425. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  426. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  427. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  428. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  429. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  430. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  431. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  432. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  433. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  434. \
  435. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  436. "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
  437. "pand bm00000111, %%mm4 \n\t" /* 00000RGB 0 */\
  438. "pand bm11111000, %%mm0 \n\t" /* 00RGB000 0.5 */\
  439. "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
  440. "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
  441. "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
  442. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  443. \
  444. "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  445. "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
  446. "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
  447. "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
  448. "pand bm00001111, %%mm2 \n\t" /* 0000RGBR 1 */\
  449. "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
  450. "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
  451. "pand bm00000111, %%mm4 \n\t" /* 00000RGB 2 */\
  452. "pand bm11111000, %%mm1 \n\t" /* 00RGB000 2.5 */\
  453. "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
  454. "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
  455. "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
  456. "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
  457. \
  458. "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
  459. "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
  460. "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
  461. "pand bm00000111, %%mm5 \n\t" /* 00000RGB 3 */\
  462. "pand bm11111000, %%mm3 \n\t" /* 00RGB000 3.5 */\
  463. "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
  464. "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
  465. "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
  466. \
  467. "leal (%%eax, %%eax, 2), %%ebx \n\t"\
  468. MOVNTQ(%%mm0, (%4, %%ebx))\
  469. MOVNTQ(%%mm2, 8(%4, %%ebx))\
  470. MOVNTQ(%%mm3, 16(%4, %%ebx))\
  471. \
  472. "addl $8, %%eax \n\t"\
  473. "cmpl %5, %%eax \n\t"\
  474. " jb 1b \n\t"
  475. static inline void yuv2yuv(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
  476. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstw, int yalpha, int uvalpha)
  477. {
  478. int yalpha1=yalpha^4095;
  479. int uvalpha1=uvalpha^4095;
  480. int i;
  481. asm volatile ("\n\t"::: "memory");
  482. for(i=0;i<dstw;i++)
  483. {
  484. ((uint8_t*)dest)[i] = (buf0[i]*yalpha1+buf1[i]*yalpha)>>19;
  485. }
  486. if(uvalpha != -1)
  487. {
  488. for(i=0; i<(dstw>>1); i++)
  489. {
  490. ((uint8_t*)uDest)[i] = (uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19;
  491. ((uint8_t*)vDest)[i] = (uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19;
  492. }
  493. }
  494. }
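// Why >>19: the horizontal scalers below store each sample left-shifted by 7 in
// the 16-bit temp buffers, and yalpha/uvalpha are 12-bit weights (0..4095, with
// yalpha1 = yalpha^4095 as the complement), so the blended product carries
// 7+12 = 19 fractional bits. One output byte is therefore equivalent to
//
//   /* dest[i] = (buf0[i]*(4095-yalpha) + buf1[i]*yalpha) >> 19; */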
  495. /**
  496. * vertical scale YV12 to RGB
  497. */
  498. static inline void yuv2rgbX(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
  499. uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
  500. {
  501. int yalpha1=yalpha^4095;
  502. int uvalpha1=uvalpha^4095;
  503. int i;
  504. if(fullUVIpol)
  505. {
  506. #ifdef HAVE_MMX
  507. if(dstbpp == 32)
  508. {
  509. asm volatile(
  510. FULL_YSCALEYUV2RGB
  511. "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
  512. "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
  513. "movq %%mm3, %%mm1 \n\t"
  514. "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
  515. "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
  516. MOVNTQ(%%mm3, (%4, %%eax, 4))
  517. MOVNTQ(%%mm1, 8(%4, %%eax, 4))
  518. "addl $4, %%eax \n\t"
  519. "cmpl %5, %%eax \n\t"
  520. " jb 1b \n\t"
  521. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
  522. "m" (yalpha1), "m" (uvalpha1)
  523. : "%eax"
  524. );
  525. }
  526. else if(dstbpp==24)
  527. {
  528. asm volatile(
  529. FULL_YSCALEYUV2RGB
  530. // lsb ... msb
  531. "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
  532. "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
  533. "movq %%mm3, %%mm1 \n\t"
  534. "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
  535. "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
  536. "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
  537. "psrlq $8, %%mm3 \n\t" // GR0BGR00
  538. "pand bm00000111, %%mm2 \n\t" // BGR00000
  539. "pand bm11111000, %%mm3 \n\t" // 000BGR00
  540. "por %%mm2, %%mm3 \n\t" // BGRBGR00
  541. "movq %%mm1, %%mm2 \n\t"
  542. "psllq $48, %%mm1 \n\t" // 000000BG
  543. "por %%mm1, %%mm3 \n\t" // BGRBGRBG
  544. "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
  545. "psrld $16, %%mm2 \n\t" // R000R000
  546. "psrlq $24, %%mm1 \n\t" // 0BGR0000
  547. "por %%mm2, %%mm1 \n\t" // RBGRR000
  548. "movl %4, %%ebx \n\t"
  549. "addl %%eax, %%ebx \n\t"
  550. #ifdef HAVE_MMX2
  551. //FIXME Alignment
  552. "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
  553. "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
  554. #else
  555. "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
  556. "psrlq $32, %%mm3 \n\t"
  557. "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
  558. "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
  559. #endif
  560. "addl $4, %%eax \n\t"
  561. "cmpl %5, %%eax \n\t"
  562. " jb 1b \n\t"
  563. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
  564. "m" (yalpha1), "m" (uvalpha1)
  565. : "%eax", "%ebx"
  566. );
  567. }
  568. else if(dstbpp==15)
  569. {
  570. asm volatile(
  571. FULL_YSCALEYUV2RGB
  572. #ifdef DITHER1XBPP
  573. "paddusb b16Dither, %%mm1 \n\t"
  574. "paddusb b16Dither, %%mm0 \n\t"
  575. "paddusb b16Dither, %%mm3 \n\t"
  576. #endif
  577. "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
  578. "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
  579. "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
  580. "psrlw $3, %%mm3 \n\t"
  581. "psllw $2, %%mm1 \n\t"
  582. "psllw $7, %%mm0 \n\t"
  583. "pand g15Mask, %%mm1 \n\t"
  584. "pand r15Mask, %%mm0 \n\t"
  585. "por %%mm3, %%mm1 \n\t"
  586. "por %%mm1, %%mm0 \n\t"
  587. MOVNTQ(%%mm0, (%4, %%eax, 2))
  588. "addl $4, %%eax \n\t"
  589. "cmpl %5, %%eax \n\t"
  590. " jb 1b \n\t"
  591. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
  592. "m" (yalpha1), "m" (uvalpha1)
  593. : "%eax"
  594. );
  595. }
  596. else if(dstbpp==16)
  597. {
  598. asm volatile(
  599. FULL_YSCALEYUV2RGB
  600. #ifdef DITHER1XBPP
  601. "paddusb g16Dither, %%mm1 \n\t"
  602. "paddusb b16Dither, %%mm0 \n\t"
  603. "paddusb b16Dither, %%mm3 \n\t"
  604. #endif
  605. "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
  606. "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
  607. "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
  608. "psrlw $3, %%mm3 \n\t"
  609. "psllw $3, %%mm1 \n\t"
  610. "psllw $8, %%mm0 \n\t"
  611. "pand g16Mask, %%mm1 \n\t"
  612. "pand r16Mask, %%mm0 \n\t"
  613. "por %%mm3, %%mm1 \n\t"
  614. "por %%mm1, %%mm0 \n\t"
  615. MOVNTQ(%%mm0, (%4, %%eax, 2))
  616. "addl $4, %%eax \n\t"
  617. "cmpl %5, %%eax \n\t"
  618. " jb 1b \n\t"
  619. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
  620. "m" (yalpha1), "m" (uvalpha1)
  621. : "%eax"
  622. );
  623. }
  624. #else
  625. asm volatile ("\n\t"::: "memory");
  626. if(dstbpp==32 || dstbpp==24)
  627. {
  628. for(i=0;i<dstw;i++){
  629. // vertical linear interpolation && yuv2rgb in a single step:
  630. int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  631. int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
  632. int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
  633. dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
  634. dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
  635. dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
  636. dest+=dstbpp>>3;
  637. }
  638. }
  639. else if(dstbpp==16)
  640. {
  641. for(i=0;i<dstw;i++){
  642. // vertical linear interpolation && yuv2rgb in a single step:
  643. int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  644. int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
  645. int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
  646. ((uint16_t*)dest)[i] =
  647. clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
  648. clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
  649. clip_table16r[(Y + yuvtab_3343[V]) >>13];
  650. }
  651. }
  652. else if(dstbpp==15)
  653. {
  654. for(i=0;i<dstw;i++){
  655. // vertical linear interpolation && yuv2rgb in a single step:
  656. int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  657. int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
  658. int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
  659. ((uint16_t*)dest)[i] =
  660. clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
  661. clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
  662. clip_table15r[(Y + yuvtab_3343[V]) >>13];
  663. }
  664. }
  665. #endif
  666. }//FULL_UV_IPOL
  667. else
  668. {
  669. #ifdef HAVE_MMX
  670. if(dstbpp == 32)
  671. {
  672. asm volatile(
  673. YSCALEYUV2RGB
  674. WRITEBGR32
  675. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
  676. "m" (yalpha1), "m" (uvalpha1)
  677. : "%eax"
  678. );
  679. }
  680. else if(dstbpp==24)
  681. {
  682. asm volatile(
  683. YSCALEYUV2RGB
  684. WRITEBGR24
  685. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
  686. "m" (yalpha1), "m" (uvalpha1)
  687. : "%eax", "%ebx"
  688. );
  689. }
  690. else if(dstbpp==15)
  691. {
  692. asm volatile(
  693. YSCALEYUV2RGB
  694. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  695. #ifdef DITHER1XBPP
  696. "paddusb b16Dither, %%mm2 \n\t"
  697. "paddusb b16Dither, %%mm4 \n\t"
  698. "paddusb b16Dither, %%mm5 \n\t"
  699. #endif
  700. WRITEBGR15
  701. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
  702. "m" (yalpha1), "m" (uvalpha1)
  703. : "%eax"
  704. );
  705. }
  706. else if(dstbpp==16)
  707. {
  708. asm volatile(
  709. YSCALEYUV2RGB
  710. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  711. #ifdef DITHER1XBPP
  712. "paddusb g16Dither, %%mm2 \n\t"
  713. "paddusb b16Dither, %%mm4 \n\t"
  714. "paddusb b16Dither, %%mm5 \n\t"
  715. #endif
  716. WRITEBGR16
  717. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
  718. "m" (yalpha1), "m" (uvalpha1)
  719. : "%eax"
  720. );
  721. }
  722. #else
  723. asm volatile ("\n\t"::: "memory");
  724. if(dstbpp==32)
  725. {
  726. for(i=0; i<dstw-1; i+=2){
  727. // vertical linear interpolation && yuv2rgb in a single step:
  728. int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  729. int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
  730. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  731. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  732. int Cb= yuvtab_40cf[U];
  733. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  734. int Cr= yuvtab_3343[V];
  735. dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
  736. dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
  737. dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
  738. dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
  739. dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
  740. dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
  741. }
  742. }
  743. if(dstbpp==24)
  744. {
  745. for(i=0; i<dstw-1; i+=2){
  746. // vertical linear interpolation && yuv2rgb in a single step:
  747. int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  748. int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
  749. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  750. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  751. int Cb= yuvtab_40cf[U];
  752. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  753. int Cr= yuvtab_3343[V];
  754. dest[0]=clip_table[((Y1 + Cb) >>13)];
  755. dest[1]=clip_table[((Y1 + Cg) >>13)];
  756. dest[2]=clip_table[((Y1 + Cr) >>13)];
  757. dest[3]=clip_table[((Y2 + Cb) >>13)];
  758. dest[4]=clip_table[((Y2 + Cg) >>13)];
  759. dest[5]=clip_table[((Y2 + Cr) >>13)];
  760. dest+=6;
  761. }
  762. }
  763. else if(dstbpp==16)
  764. {
  765. for(i=0; i<dstw-1; i+=2){
  766. // vertical linear interpolation && yuv2rgb in a single step:
  767. int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  768. int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
  769. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  770. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  771. int Cb= yuvtab_40cf[U];
  772. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  773. int Cr= yuvtab_3343[V];
  774. ((uint16_t*)dest)[i] =
  775. clip_table16b[(Y1 + Cb) >>13] |
  776. clip_table16g[(Y1 + Cg) >>13] |
  777. clip_table16r[(Y1 + Cr) >>13];
  778. ((uint16_t*)dest)[i+1] =
  779. clip_table16b[(Y2 + Cb) >>13] |
  780. clip_table16g[(Y2 + Cg) >>13] |
  781. clip_table16r[(Y2 + Cr) >>13];
  782. }
  783. }
  784. else if(dstbpp==15)
  785. {
  786. for(i=0; i<dstw-1; i+=2){
  787. // vertical linear interpolation && yuv2rgb in a single step:
  788. int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  789. int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
  790. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  791. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  792. int Cb= yuvtab_40cf[U];
  793. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  794. int Cr= yuvtab_3343[V];
  795. ((uint16_t*)dest)[i] =
  796. clip_table15b[(Y1 + Cb) >>13] |
  797. clip_table15g[(Y1 + Cg) >>13] |
  798. clip_table15r[(Y1 + Cr) >>13];
  799. ((uint16_t*)dest)[i+1] =
  800. clip_table15b[(Y2 + Cb) >>13] |
  801. clip_table15g[(Y2 + Cg) >>13] |
  802. clip_table15r[(Y2 + Cr) >>13];
  803. }
  804. }
  805. #endif
  806. } //!FULL_UV_IPOL
  807. }
  808. /**
  809. * YV12 to RGB without scaling or interpolating
  810. */
  811. static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
  812. uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
  813. {
  814. int yalpha1=yalpha^4095;
  815. int uvalpha1=uvalpha^4095;
  816. int i;
  817. if(fullUVIpol || allwaysIpol)
  818. {
  819. yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
  820. return;
  821. }
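// yalpha > 2048 means the destination line is nearer to buf1, so the
// non-interpolating code below (which only reads buf0 for luma) should use that line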
  822. if( yalpha > 2048 ) buf0 = buf1;
  823. #ifdef HAVE_MMX
  824. if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
  825. {
  826. if(dstbpp == 32)
  827. {
  828. asm volatile(
  829. YSCALEYUV2RGB1
  830. WRITEBGR32
  831. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
  832. "m" (yalpha1), "m" (uvalpha1)
  833. : "%eax"
  834. );
  835. }
  836. else if(dstbpp==24)
  837. {
  838. asm volatile(
  839. YSCALEYUV2RGB1
  840. WRITEBGR24
  841. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
  842. "m" (yalpha1), "m" (uvalpha1)
  843. : "%eax", "%ebx"
  844. );
  845. }
  846. else if(dstbpp==15)
  847. {
  848. asm volatile(
  849. YSCALEYUV2RGB1
  850. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  851. #ifdef DITHER1XBPP
  852. "paddusb b16Dither, %%mm2 \n\t"
  853. "paddusb b16Dither, %%mm4 \n\t"
  854. "paddusb b16Dither, %%mm5 \n\t"
  855. #endif
  856. WRITEBGR15
  857. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
  858. "m" (yalpha1), "m" (uvalpha1)
  859. : "%eax"
  860. );
  861. }
  862. else if(dstbpp==16)
  863. {
  864. asm volatile(
  865. YSCALEYUV2RGB1
  866. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  867. #ifdef DITHER1XBPP
  868. "paddusb g16Dither, %%mm2 \n\t"
  869. "paddusb b16Dither, %%mm4 \n\t"
  870. "paddusb b16Dither, %%mm5 \n\t"
  871. #endif
  872. WRITEBGR16
  873. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
  874. "m" (yalpha1), "m" (uvalpha1)
  875. : "%eax"
  876. );
  877. }
  878. }
  879. else
  880. {
  881. if(dstbpp == 32)
  882. {
  883. asm volatile(
  884. YSCALEYUV2RGB1b
  885. WRITEBGR32
  886. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
  887. "m" (yalpha1), "m" (uvalpha1)
  888. : "%eax"
  889. );
  890. }
  891. else if(dstbpp==24)
  892. {
  893. asm volatile(
  894. YSCALEYUV2RGB1b
  895. WRITEBGR24
  896. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
  897. "m" (yalpha1), "m" (uvalpha1)
  898. : "%eax", "%ebx"
  899. );
  900. }
  901. else if(dstbpp==15)
  902. {
  903. asm volatile(
  904. YSCALEYUV2RGB1b
  905. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  906. #ifdef DITHER1XBPP
  907. "paddusb b16Dither, %%mm2 \n\t"
  908. "paddusb b16Dither, %%mm4 \n\t"
  909. "paddusb b16Dither, %%mm5 \n\t"
  910. #endif
  911. WRITEBGR15
  912. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
  913. "m" (yalpha1), "m" (uvalpha1)
  914. : "%eax"
  915. );
  916. }
  917. else if(dstbpp==16)
  918. {
  919. asm volatile(
  920. YSCALEYUV2RGB1b
  921. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  922. #ifdef DITHER1XBPP
  923. "paddusb g16Dither, %%mm2 \n\t"
  924. "paddusb b16Dither, %%mm4 \n\t"
  925. "paddusb b16Dither, %%mm5 \n\t"
  926. #endif
  927. WRITEBGR16
  928. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
  929. "m" (yalpha1), "m" (uvalpha1)
  930. : "%eax"
  931. );
  932. }
  933. }
  934. #else
  935. //FIXME write 2 versions (for even & odd lines)
  936. asm volatile ("\n\t"::: "memory");
  937. if(dstbpp==32)
  938. {
  939. for(i=0; i<dstw-1; i+=2){
  940. // vertical linear interpolation && yuv2rgb in a single step:
  941. int Y1=yuvtab_2568[buf0[i]>>7];
  942. int Y2=yuvtab_2568[buf0[i+1]>>7];
  943. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  944. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  945. int Cb= yuvtab_40cf[U];
  946. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  947. int Cr= yuvtab_3343[V];
  948. dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
  949. dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
  950. dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
  951. dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
  952. dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
  953. dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
  954. }
  955. }
  956. if(dstbpp==24)
  957. {
  958. for(i=0; i<dstw-1; i+=2){
  959. // vertical linear interpolation && yuv2rgb in a single step:
  960. int Y1=yuvtab_2568[buf0[i]>>7];
  961. int Y2=yuvtab_2568[buf0[i+1]>>7];
  962. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  963. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  964. int Cb= yuvtab_40cf[U];
  965. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  966. int Cr= yuvtab_3343[V];
  967. dest[0]=clip_table[((Y1 + Cb) >>13)];
  968. dest[1]=clip_table[((Y1 + Cg) >>13)];
  969. dest[2]=clip_table[((Y1 + Cr) >>13)];
  970. dest[3]=clip_table[((Y2 + Cb) >>13)];
  971. dest[4]=clip_table[((Y2 + Cg) >>13)];
  972. dest[5]=clip_table[((Y2 + Cr) >>13)];
  973. dest+=6;
  974. }
  975. }
  976. else if(dstbpp==16)
  977. {
  978. for(i=0; i<dstw-1; i+=2){
  979. // vertical linear interpolation && yuv2rgb in a single step:
  980. int Y1=yuvtab_2568[buf0[i]>>7];
  981. int Y2=yuvtab_2568[buf0[i+1]>>7];
  982. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  983. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  984. int Cb= yuvtab_40cf[U];
  985. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  986. int Cr= yuvtab_3343[V];
  987. ((uint16_t*)dest)[i] =
  988. clip_table16b[(Y1 + Cb) >>13] |
  989. clip_table16g[(Y1 + Cg) >>13] |
  990. clip_table16r[(Y1 + Cr) >>13];
  991. ((uint16_t*)dest)[i+1] =
  992. clip_table16b[(Y2 + Cb) >>13] |
  993. clip_table16g[(Y2 + Cg) >>13] |
  994. clip_table16r[(Y2 + Cr) >>13];
  995. }
  996. }
  997. else if(dstbpp==15)
  998. {
  999. for(i=0; i<dstw-1; i+=2){
  1000. // vertical linear interpolation && yuv2rgb in a single step:
  1001. int Y1=yuvtab_2568[buf0[i]>>7];
  1002. int Y2=yuvtab_2568[buf0[i+1]>>7];
  1003. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1004. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1005. int Cb= yuvtab_40cf[U];
  1006. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1007. int Cr= yuvtab_3343[V];
  1008. ((uint16_t*)dest)[i] =
  1009. clip_table15b[(Y1 + Cb) >>13] |
  1010. clip_table15g[(Y1 + Cg) >>13] |
  1011. clip_table15r[(Y1 + Cr) >>13];
  1012. ((uint16_t*)dest)[i+1] =
  1013. clip_table15b[(Y2 + Cb) >>13] |
  1014. clip_table15g[(Y2 + Cg) >>13] |
  1015. clip_table15r[(Y2 + Cr) >>13];
  1016. }
  1017. }
  1018. #endif
  1019. }
  1020. static inline void hyscale(uint16_t *dst, int dstWidth, uint8_t *src, int srcWidth, int xInc)
  1021. {
  1022. int i;
  1023. unsigned int xpos=0;
  1024. // *** horizontal scale Y line to temp buffer
  1025. #ifdef ARCH_X86
  1026. #ifdef HAVE_MMX2
  1027. if(canMMX2BeUsed)
  1028. {
  1029. asm volatile(
  1030. "pxor %%mm7, %%mm7 \n\t"
  1031. "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
  1032. "movd %5, %%mm6 \n\t" // xInc&0xFFFF
  1033. "punpcklwd %%mm6, %%mm6 \n\t"
  1034. "punpcklwd %%mm6, %%mm6 \n\t"
  1035. "movq %%mm6, %%mm2 \n\t"
  1036. "psllq $16, %%mm2 \n\t"
  1037. "paddw %%mm6, %%mm2 \n\t"
  1038. "psllq $16, %%mm2 \n\t"
  1039. "paddw %%mm6, %%mm2 \n\t"
  1040. "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF
  1041. "movq %%mm2, temp0 \n\t"
  1042. "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
  1043. "punpcklwd %%mm6, %%mm6 \n\t"
  1044. "punpcklwd %%mm6, %%mm6 \n\t"
  1045. "xorl %%eax, %%eax \n\t" // i
  1046. "movl %0, %%esi \n\t" // src
  1047. "movl %1, %%edi \n\t" // buf1
  1048. "movl %3, %%edx \n\t" // (xInc*4)>>16
  1049. "xorl %%ecx, %%ecx \n\t"
  1050. "xorl %%ebx, %%ebx \n\t"
  1051. "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
  1052. #define FUNNY_Y_CODE \
  1053. PREFETCH" 1024(%%esi) \n\t"\
  1054. PREFETCH" 1056(%%esi) \n\t"\
  1055. PREFETCH" 1088(%%esi) \n\t"\
  1056. "call funnyYCode \n\t"\
  1057. "movq temp0, %%mm2 \n\t"\
  1058. "xorl %%ecx, %%ecx \n\t"
  1059. FUNNY_Y_CODE
  1060. FUNNY_Y_CODE
  1061. FUNNY_Y_CODE
  1062. FUNNY_Y_CODE
  1063. FUNNY_Y_CODE
  1064. FUNNY_Y_CODE
  1065. FUNNY_Y_CODE
  1066. FUNNY_Y_CODE
  1067. :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
  1068. "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF)
  1069. : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
  1070. );
  1071. for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth-1; i--) dst[i] = src[srcWidth-1]*128;
  1072. }
  1073. else
  1074. {
  1075. #endif
  1076. //NO MMX just normal asm ...
  1077. asm volatile(
  1078. "xorl %%eax, %%eax \n\t" // i
  1079. "xorl %%ebx, %%ebx \n\t" // xx
  1080. "xorl %%ecx, %%ecx \n\t" // 2*xalpha
  1081. "1: \n\t"
  1082. "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
  1083. "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
  1084. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  1085. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  1086. "shll $16, %%edi \n\t"
  1087. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  1088. "movl %1, %%edi \n\t"
  1089. "shrl $9, %%esi \n\t"
  1090. "movw %%si, (%%edi, %%eax, 2) \n\t"
  1091. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  1092. "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
  1093. "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
  1094. "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
  1095. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  1096. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  1097. "shll $16, %%edi \n\t"
  1098. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  1099. "movl %1, %%edi \n\t"
  1100. "shrl $9, %%esi \n\t"
  1101. "movw %%si, 2(%%edi, %%eax, 2) \n\t"
  1102. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  1103. "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
  1104. "addl $2, %%eax \n\t"
  1105. "cmpl %2, %%eax \n\t"
  1106. " jb 1b \n\t"
  1107. :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
  1108. : "%eax", "%ebx", "%ecx", "%edi", "%esi"
  1109. );
  1110. #ifdef HAVE_MMX2
  1111. } //if MMX2 can't be used
  1112. #endif
  1113. #else
  1114. for(i=0;i<dstWidth;i++){
  1115. register unsigned int xx=xpos>>16;
  1116. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  1117. dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
  1118. xpos+=xInc;
  1119. }
  1120. #endif
  1121. }
  1122. inline static void hcscale(uint16_t *dst, int dstWidth,
  1123. uint8_t *src1, uint8_t *src2, int srcWidth, int xInc)
  1124. {
  1125. int xpos=0;
  1126. int i;
  1127. #ifdef ARCH_X86
  1128. #ifdef HAVE_MMX2
  1129. if(canMMX2BeUsed)
  1130. {
  1131. asm volatile(
  1132. "pxor %%mm7, %%mm7 \n\t"
  1133. "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
  1134. "movd %5, %%mm6 \n\t" // xInc&0xFFFF
  1135. "punpcklwd %%mm6, %%mm6 \n\t"
  1136. "punpcklwd %%mm6, %%mm6 \n\t"
  1137. "movq %%mm6, %%mm2 \n\t"
  1138. "psllq $16, %%mm2 \n\t"
  1139. "paddw %%mm6, %%mm2 \n\t"
  1140. "psllq $16, %%mm2 \n\t"
  1141. "paddw %%mm6, %%mm2 \n\t"
  1142. "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
  1143. "movq %%mm2, temp0 \n\t"
  1144. "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
  1145. "punpcklwd %%mm6, %%mm6 \n\t"
  1146. "punpcklwd %%mm6, %%mm6 \n\t"
  1147. "xorl %%eax, %%eax \n\t" // i
  1148. "movl %0, %%esi \n\t" // src
  1149. "movl %1, %%edi \n\t" // buf1
  1150. "movl %3, %%edx \n\t" // (xInc*4)>>16
  1151. "xorl %%ecx, %%ecx \n\t"
  1152. "xorl %%ebx, %%ebx \n\t"
  1153. "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
  1154. #define FUNNYUVCODE \
  1155. PREFETCH" 1024(%%esi) \n\t"\
  1156. PREFETCH" 1056(%%esi) \n\t"\
  1157. PREFETCH" 1088(%%esi) \n\t"\
  1158. "call funnyUVCode \n\t"\
  1159. "movq temp0, %%mm2 \n\t"\
  1160. "xorl %%ecx, %%ecx \n\t"
  1161. FUNNYUVCODE
  1162. FUNNYUVCODE
  1163. FUNNYUVCODE
  1164. FUNNYUVCODE
  1165. FUNNYUVCODE
  1166. FUNNYUVCODE
  1167. FUNNYUVCODE
  1168. FUNNYUVCODE
  1169. "xorl %%eax, %%eax \n\t" // i
  1170. "movl %6, %%esi \n\t" // src
  1171. "movl %1, %%edi \n\t" // buf1
  1172. "addl $4096, %%edi \n\t"
  1173. FUNNYUVCODE
  1174. FUNNYUVCODE
  1175. FUNNYUVCODE
  1176. FUNNYUVCODE
  1177. FUNNYUVCODE
  1178. FUNNYUVCODE
  1179. FUNNYUVCODE
  1180. FUNNYUVCODE
  1181. :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
  1182. "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2)
  1183. : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
  1184. );
  1185. for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth/2-1; i--)
  1186. {
  1187. dst[i] = src1[srcWidth/2-1]*128;
  1188. dst[i+2048] = src2[srcWidth/2-1]*128;
  1189. }
  1190. }
  1191. else
  1192. {
  1193. #endif
  1194. asm volatile(
  1195. "xorl %%eax, %%eax \n\t" // i
  1196. "xorl %%ebx, %%ebx \n\t" // xx
  1197. "xorl %%ecx, %%ecx \n\t" // 2*xalpha
  1198. "1: \n\t"
  1199. "movl %0, %%esi \n\t"
  1200. "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
  1201. "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
  1202. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  1203. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  1204. "shll $16, %%edi \n\t"
  1205. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  1206. "movl %1, %%edi \n\t"
  1207. "shrl $9, %%esi \n\t"
  1208. "movw %%si, (%%edi, %%eax, 2) \n\t"
  1209. "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
  1210. "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
  1211. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  1212. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  1213. "shll $16, %%edi \n\t"
  1214. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  1215. "movl %1, %%edi \n\t"
  1216. "shrl $9, %%esi \n\t"
  1217. "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
  1218. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  1219. "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
  1220. "addl $1, %%eax \n\t"
  1221. "cmpl %2, %%eax \n\t"
  1222. " jb 1b \n\t"
  1223. :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
  1224. "r" (src2)
  1225. : "%eax", "%ebx", "%ecx", "%edi", "%esi"
  1226. );
  1227. #ifdef HAVE_MMX2
  1228. } //if MMX2 can't be used
  1229. #endif
  1230. #else
  1231. for(i=0;i<dstWidth;i++){
  1232. register unsigned int xx=xpos>>16;
  1233. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  1234. dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
  1235. dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
  1236. /* slower
  1237. dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
  1238. dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
  1239. */
  1240. xpos+=xInc;
  1241. }
  1242. #endif
  1243. }
  1244. // *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices:
  1245. // *** Note: it's called multiple times while decoding a frame, first time y==0
  1246. // *** Designed to upscale, but may work for downscale too.
  1247. // s_xinc = (src_width << 16) / dst_width
  1248. // s_yinc = (src_height << 16) / dst_height
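// Both increments are 16.16 fixed point; e.g. scaling a 320-pixel-wide source to
// 640 destination pixels gives s_xinc = (320<<16)/640 = 0x8000, so the source
// position advances half a pixel per destination pixel. A hypothetical caller
// would set them up roughly like this (variable names are illustrative only):
//
//   unsigned int s_xinc = ((unsigned)src_width  << 16) / dst_width;
//   unsigned int s_yinc = ((unsigned)src_height << 16) / dst_height;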
  1249. void SwScale_YV12slice(unsigned char* srcptr[],int stride[], int y, int h,
  1250. uint8_t* dstptr[], int dststride, int dstw, int dstbpp,
  1251. unsigned int s_xinc,unsigned int s_yinc){
  1252. // scaling factors:
  1253. //static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height;
  1254. //static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width;
  1255. unsigned int s_xinc2;
  1256. static int s_srcypos; // points to the dst pixel's center in the source (0 is the center of pixel 0,0 in src)
  1257. static int s_ypos;
  1258. // last horizontally interpolated lines, used to avoid unnecessary calculations
  1259. static int s_last_ypos;
  1260. static int s_last_y1pos;
  1261. static int static_dstw;
  1262. #ifdef HAVE_MMX2
  1263. // used to detect a horizontal size change
  1264. static int old_dstw= -1;
  1265. static int old_s_xinc= -1;
  1266. #endif
  1267. int srcWidth= (dstw*s_xinc + 0x8000)>>16;
  1268. int dstUVw= fullUVIpol ? dstw : dstw/2;
  1269. int i;
  1270. #ifdef HAVE_MMX2
  1271. canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0 && (srcWidth&15)==0) ? 1 : 0;
  1272. #endif
  1273. // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
  1274. // n-2 is the last chrominance sample available
  1275. // FIXME this is not perfect, but no one should notice the difference; the more correct variant
  1276. // would be like the vertical one, but that would require some special code for the
  1277. // first and last pixel
  1278. if(canMMX2BeUsed) s_xinc+= 20;
  1279. else s_xinc = ((srcWidth-2)<<16)/(dstw-2) - 20;
  1280. if(fullUVIpol && !(dstbpp==12)) s_xinc2= s_xinc>>1;
  1281. else s_xinc2= s_xinc;
  1282. // force calculation of the horizontal interpolation of the first line
  1283. if(y==0){
  1284. s_last_ypos=-99;
  1285. s_last_y1pos=-99;
  1286. s_srcypos= s_yinc/2 - 0x8000;
  1287. s_ypos=0;
  1288. // clean the buffers so that no green stuff is drawn if the width is not sane (%8=0)
  1289. for(i=dstw-2; i<dstw+20; i++)
  1290. {
  1291. pix_buf_uv[0][i] = pix_buf_uv[1][i]
  1292. = pix_buf_uv[0][2048+i] = pix_buf_uv[1][2048+i] = 128;
  1293. pix_buf_uv[0][i/2] = pix_buf_uv[1][i/2]
  1294. = pix_buf_uv[0][2048+i/2] = pix_buf_uv[1][2048+i/2] = 128;
  1295. pix_buf_y[0][i]= pix_buf_y[1][i]= 0;
  1296. }
  1297. #ifdef HAVE_MMX2
  1298. // can't downscale !!!
  1299. if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed)
  1300. {
  1301. uint8_t *fragment;
  1302. int imm8OfPShufW1;
  1303. int imm8OfPShufW2;
  1304. int fragmentLength;
  1305. int xpos, xx, xalpha, i;
  1306. old_s_xinc= s_xinc;
  1307. old_dstw= dstw;
  1308. static_dstw= dstw;
  1309. // create an optimized horizontal scaling routine
  1310. //code fragment
  1311. asm volatile(
  1312. "jmp 9f \n\t"
  1313. // Begin
  1314. "0: \n\t"
  1315. "movq (%%esi), %%mm0 \n\t" //FIXME Alignment
  1316. "movq %%mm0, %%mm1 \n\t"
  1317. "psrlq $8, %%mm0 \n\t"
  1318. "punpcklbw %%mm7, %%mm1 \n\t"
  1319. "movq %%mm2, %%mm3 \n\t"
  1320. "punpcklbw %%mm7, %%mm0 \n\t"
  1321. "addw %%bx, %%cx \n\t" //2*xalpha += (4*s_xinc)&0xFFFF
  1322. "pshufw $0xFF, %%mm1, %%mm1 \n\t"
  1323. "1: \n\t"
  1324. "adcl %%edx, %%esi \n\t" //xx+= (4*s_xinc)>>16 + carry
  1325. "pshufw $0xFF, %%mm0, %%mm0 \n\t"
  1326. "2: \n\t"
  1327. "psrlw $9, %%mm3 \n\t"
  1328. "psubw %%mm1, %%mm0 \n\t"
  1329. "pmullw %%mm3, %%mm0 \n\t"
  1330. "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF
  1331. "psllw $7, %%mm1 \n\t"
  1332. "paddw %%mm1, %%mm0 \n\t"
  1333. "movq %%mm0, (%%edi, %%eax) \n\t"
  1334. "addl $8, %%eax \n\t"
  1335. // End
  1336. "9: \n\t"
  1337. // "int $3\n\t"
  1338. "leal 0b, %0 \n\t"
  1339. "leal 1b, %1 \n\t"
  1340. "leal 2b, %2 \n\t"
  1341. "decl %1 \n\t"
  1342. "decl %2 \n\t"
  1343. "subl %0, %1 \n\t"
  1344. "subl %0, %2 \n\t"
  1345. "leal 9b, %3 \n\t"
  1346. "subl %0, %3 \n\t"
  1347. :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
  1348. "=r" (fragmentLength)
  1349. );
  1350. xpos= 0; //s_xinc/2 - 0x8000; // difference between pixel centers
  1351. /* choose xinc so that all 8 parts fit exactly
  1352. Note: we cannot use just 1 part because it would not fit in the code cache */
  1353. // s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))-10;
  1354. // s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8));
  1355. #ifdef ALT_ERROR
  1356. // s_xinc2_diff+= ((0x10000/(dstw/8)));
  1357. #endif
  1358. // s_xinc_diff= s_xinc2_diff*2;
  1359. // s_xinc2+= s_xinc2_diff;
  1360. // s_xinc+= s_xinc_diff;
  1361. // old_s_xinc= s_xinc;
  1362. for(i=0; i<dstw/8; i++)
  1363. {
  1364. int xx=xpos>>16;
  1365. if((i&3) == 0)
  1366. {
  1367. int a=0;
  1368. int b=((xpos+s_xinc)>>16) - xx;
  1369. int c=((xpos+s_xinc*2)>>16) - xx;
  1370. int d=((xpos+s_xinc*3)>>16) - xx;
  1371. memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength);
  1372. funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]=
  1373. funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
  1374. a | (b<<2) | (c<<4) | (d<<6);
  1375. // if we don't need to read 8 bytes then don't :), reduces the chance of
  1376. // crossing a cache line
  1377. if(d<3) funnyYCode[fragmentLength*i/4 + 1]= 0x6E;
  1378. funnyYCode[fragmentLength*(i+4)/4]= RET;
  1379. }
  1380. xpos+=s_xinc;
  1381. }
  1382. xpos= 0; //s_xinc2/2 - 0x10000; // difference between centers of chrom samples
  1383. for(i=0; i<dstUVw/8; i++)
  1384. {
  1385. int xx=xpos>>16;
  1386. if((i&3) == 0)
  1387. {
  1388. int a=0;
  1389. int b=((xpos+s_xinc2)>>16) - xx;
  1390. int c=((xpos+s_xinc2*2)>>16) - xx;
  1391. int d=((xpos+s_xinc2*3)>>16) - xx;
  1392. memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);
  1393. funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
  1394. funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
  1395. a | (b<<2) | (c<<4) | (d<<6);
  1396. // if we don't need to read 8 bytes then don't :), reduces the chance of
  1397. // crossing a cache line
  1398. if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E;
  1399. funnyUVCode[fragmentLength*(i+4)/4]= RET;
  1400. }
  1401. xpos+=s_xinc2;
  1402. }
  1403. // funnyCode[0]= RET;
  1404. }
  1405. #endif // HAVE_MMX2
  1406. } // reset counters
  1407. while(1){
  1408. unsigned char *dest =dstptr[0]+dststride*s_ypos;
  1409. unsigned char *uDest=dstptr[1]+(dststride>>1)*(s_ypos>>1);
  1410. unsigned char *vDest=dstptr[2]+(dststride>>1)*(s_ypos>>1);
  1411. int y0=(s_srcypos + 0xFFFF)>>16; // first luminance source line number below the dst line
  1412. // points to the dst pixel's center in the source (0 is the center of pixel 0,0 in src)
  1413. int srcuvpos= dstbpp==12 ? s_srcypos + s_yinc/2 - 0x8000 :
  1414. s_srcypos - 0x8000;
  1415. int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line
  1416. int yalpha=((s_srcypos-1)&0xFFFF)>>4;
  1417. int uvalpha=((srcuvpos-1)&0x1FFFF)>>5;
  1418. uint16_t *buf0=pix_buf_y[y0&1]; // top line of the interpolated slice
  1419. uint16_t *buf1=pix_buf_y[((y0+1)&1)]; // bottom line of the interpolated slice
  1420. uint16_t *uvbuf0=pix_buf_uv[y1&1]; // top line of the interpolated slice
  1421. uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1]; // bottom line of the interpolated slice
  1422. int i;
  1423. if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are duplicates anyway
  1424. if((y0&1) && dstbpp==12) uvalpha=-1; // there is no alpha if there is no line
  1425. s_ypos++; s_srcypos+=s_yinc;
  1426. // only interpolate the src line horizontally if we didn't do it already
  1427. if(s_last_ypos!=y0)
  1428. {
  1429. unsigned char *src;
  1430. // skip if first line has been horiz scaled already
  1431. if(s_last_ypos != y0-1)
  1432. {
  1433. // check if first line is before any available src lines
  1434. if(y0-1 < y) src=srcptr[0]+(0 )*stride[0];
  1435. else src=srcptr[0]+(y0-y-1)*stride[0];
  1436. hyscale(buf0, dstw, src, srcWidth, s_xinc);
  1437. }
  1438. // check if second line is after any available src lines
  1439. if(y0-y >= h) src=srcptr[0]+(h-1)*stride[0];
  1440. else src=srcptr[0]+(y0-y)*stride[0];
  1441. // the min() is required to avoid reusing lines which were not available
  1442. s_last_ypos= MIN(y0, y+h-1);
  1443. hyscale(buf1, dstw, src, srcWidth, s_xinc);
  1444. }
  1445. // printf("%d %d %d %d\n", y, y1, s_last_y1pos, h);
  1446. // *** horizontal scale U and V lines to temp buffer
  1447. if(s_last_y1pos!=y1)
  1448. {
  1449. uint8_t *src1, *src2;
  1450. // skip if first line has been horiz scaled already
  1451. if(s_last_y1pos != y1-1)
  1452. {
  1453. // check if first line is before any available src lines
  1454. if(y1-y/2-1 < 0)
  1455. {
  1456. src1= srcptr[1]+(0)*stride[1];
  1457. src2= srcptr[2]+(0)*stride[2];
  1458. }else{
  1459. src1= srcptr[1]+(y1-y/2-1)*stride[1];
  1460. src2= srcptr[2]+(y1-y/2-1)*stride[2];
  1461. }
  1462. hcscale(uvbuf0, dstUVw, src1, src2, srcWidth, s_xinc2);
  1463. }
  1464. // check if second line is after any available src lines
  1465. if(y1 - y/2 >= h/2)
  1466. {
  1467. src1= srcptr[1]+(h/2-1)*stride[1];
  1468. src2= srcptr[2]+(h/2-1)*stride[2];
  1469. }else{
  1470. src1= srcptr[1]+(y1-y/2)*stride[1];
  1471. src2= srcptr[2]+(y1-y/2)*stride[2];
  1472. }
  1473. hcscale(uvbuf1, dstUVw, src1, src2, srcWidth, s_xinc2);
  1474. // the min() is required to avoid reusing lines which were not available
  1475. s_last_y1pos= MIN(y1, y/2+h/2-1);
  1476. }
  1477. if(dstbpp==12) //YV12
  1478. yuv2yuv(buf0, buf1, uvbuf0, uvbuf1, dest, uDest, vDest, dstw, yalpha, uvalpha);
  1479. else if(ABS(s_yinc - 0x10000) < 10)
  1480. yuv2rgb1(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
  1481. else
  1482. yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
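// rotate the dither constants so that consecutive output lines use a different
// pattern (a simple ordered dither for the 15/16 bpp MMX paths)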
  1483. #ifdef HAVE_MMX
  1484. b16Dither= b16Dither1;
  1485. b16Dither1= b16Dither2;
  1486. b16Dither2= b16Dither;
  1487. g16Dither= g16Dither1;
  1488. g16Dither1= g16Dither2;
  1489. g16Dither2= g16Dither;
  1490. #endif
  1491. }
  1492. #ifdef HAVE_MMX
  1493. __asm __volatile(SFENCE:::"memory");
  1494. __asm __volatile(EMMS:::"memory");
  1495. #endif
  1496. }
  1497. void SwScale_Init(){
  1498. // generating tables:
  1499. int i;
  1500. for(i=0;i<256;i++){
  1501. clip_table[i]=0;
  1502. clip_table[i+256]=i;
  1503. clip_table[i+512]=255;
  1504. yuvtab_2568[i]=(0x2568*(i-16))+(256<<13);
  1505. yuvtab_3343[i]=0x3343*(i-128);
  1506. yuvtab_0c92[i]=-0x0c92*(i-128);
  1507. yuvtab_1a1e[i]=-0x1a1e*(i-128);
  1508. yuvtab_40cf[i]=0x40cf*(i-128);
  1509. }
  1510. for(i=0; i<768; i++)
  1511. {
  1512. int v= clip_table[i];
  1513. clip_table16b[i]= v>>3;
  1514. clip_table16g[i]= (v<<3)&0x07E0;
  1515. clip_table16r[i]= (v<<8)&0xF800;
  1516. clip_table15b[i]= v>>3;
  1517. clip_table15g[i]= (v<<2)&0x03E0;
  1518. clip_table15r[i]= (v<<7)&0x7C00;
  1519. }
  1520. }
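/*
 * Usage sketch (not part of the original file; buffer setup and parameter
 * choices are assumptions for illustration): initialise once, then feed YV12
 * slices from top to bottom, starting with y == 0 for each frame.
 *
 *   unsigned char *src[3];              // Y, U, V plane pointers of the slice
 *   int stride[3];                      // per-plane strides in bytes
 *   uint8_t *dst[3];                    // dst[0] receives the RGB output
 *   int dststride = dstw * 4;           // bytes per output line for dstbpp==32
 *   unsigned int s_xinc = (src_width  << 16) / dstw;
 *   unsigned int s_yinc = (src_height << 16) / dsth;
 *
 *   SwScale_Init();
 *   SwScale_YV12slice(src, stride, 0, slice_height,
 *                     dst, dststride, dstw, 32, s_xinc, s_yinc);
 */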