  1. // Software scaling and colorspace conversion routines for MPlayer
  2. // Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
  3. // current version mostly by Michael Niedermayer (michaelni@gmx.at)
  4. // the parts written by michael are under GNU GPL
  5. #undef MOVNTQ
  6. #undef PAVGB
  7. #undef PREFETCH
  8. #undef PREFETCHW
  9. #undef EMMS
  10. #undef SFENCE
  11. #ifdef HAVE_3DNOW
  12. /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
  13. #define EMMS "femms"
  14. #else
  15. #define EMMS "emms"
  16. #endif
  17. #ifdef HAVE_3DNOW
  18. #define PREFETCH "prefetch"
  19. #define PREFETCHW "prefetchw"
  20. #elif defined ( HAVE_MMX2 )
  21. #define PREFETCH "prefetchnta"
  22. #define PREFETCHW "prefetcht0"
  23. #else
  24. #define PREFETCH "/nop"
  25. #define PREFETCHW "/nop"
  26. #endif
  27. #ifdef HAVE_MMX2
  28. #define SFENCE "sfence"
  29. #else
  30. #define SFENCE "/nop"
  31. #endif
  32. #ifdef HAVE_MMX2
  33. #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
  34. #elif defined (HAVE_3DNOW)
  35. #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
  36. #endif
  37. #ifdef HAVE_MMX2
  38. #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
  39. #else
  40. #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
  41. #endif
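// Note on YSCALEYUV2YV12X below (summary comment added for clarity): it is the
// vertical filter step. For each group of 8 output pixels it walks the filter
// taps (negative counter in %%edx), accumulating srcLine[tap][x]*coeff[tap]
// with pmulhw (keeping the high 16 bits of each product), then shifts the sums
// right by 3, packs them to unsigned bytes and streams the result with MOVNTQ.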
  42. #define YSCALEYUV2YV12X(x) \
  43. "xorl %%eax, %%eax \n\t"\
  44. "pxor %%mm3, %%mm3 \n\t"\
  45. "pxor %%mm4, %%mm4 \n\t"\
  46. "movl %0, %%edx \n\t"\
  47. ".balign 16 \n\t" /* FIXME Unroll? */\
  48. "1: \n\t"\
  49. "movl (%1, %%edx, 4), %%esi \n\t"\
  50. "movq (%2, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
  51. "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\
  52. "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
  53. "pmulhw %%mm0, %%mm2 \n\t"\
  54. "pmulhw %%mm0, %%mm5 \n\t"\
  55. "paddw %%mm2, %%mm3 \n\t"\
  56. "paddw %%mm5, %%mm4 \n\t"\
  57. "addl $1, %%edx \n\t"\
  58. " jnz 1b \n\t"\
  59. "psraw $3, %%mm3 \n\t"\
  60. "psraw $3, %%mm4 \n\t"\
  61. "packuswb %%mm4, %%mm3 \n\t"\
  62. MOVNTQ(%%mm3, (%3, %%eax))\
  63. "addl $8, %%eax \n\t"\
  64. "cmpl %4, %%eax \n\t"\
  65. "pxor %%mm3, %%mm3 \n\t"\
  66. "pxor %%mm4, %%mm4 \n\t"\
  67. "movl %0, %%edx \n\t"\
  68. "jb 1b \n\t"
  69. #define YSCALEYUV2YV121 \
  70. "movl %2, %%eax \n\t"\
  71. ".balign 16 \n\t" /* FIXME Unroll? */\
  72. "1: \n\t"\
  73. "movq (%0, %%eax, 2), %%mm0 \n\t"\
  74. "movq 8(%0, %%eax, 2), %%mm1 \n\t"\
  75. "psraw $7, %%mm0 \n\t"\
  76. "psraw $7, %%mm1 \n\t"\
  77. "packuswb %%mm1, %%mm0 \n\t"\
  78. MOVNTQ(%%mm0, (%1, %%eax))\
  79. "addl $8, %%eax \n\t"\
  80. "jnc 1b \n\t"
  81. /*
  82. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  83. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  84. "r" (dest), "m" (dstW),
  85. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  86. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  87. */
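// YSCALEYUV2RGBX (summary comment added for clarity): vertically filters the
// chroma planes first, then the luma plane, subtracts the 16/128 offsets
// (w80/w400), multiplies by the fixed-point conversion coefficients (yCoeff,
// ubCoeff, ugCoeff, vgCoeff, vrCoeff) and leaves packed bytes with B in %%mm2,
// G in %%mm4, R in %%mm5 and %%mm7 zeroed, ready for the WRITEBGR* macros.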
  88. #define YSCALEYUV2RGBX \
  89. "xorl %%eax, %%eax \n\t"\
  90. ".balign 16 \n\t"\
  91. "1: \n\t"\
  92. "movl %1, %%edx \n\t" /* -chrFilterSize */\
  93. "movl %3, %%ebx \n\t" /* chrMmxFilter+lumFilterSize */\
  94. "movl %7, %%ecx \n\t" /* chrSrc+lumFilterSize */\
  95. "pxor %%mm3, %%mm3 \n\t"\
  96. "pxor %%mm4, %%mm4 \n\t"\
  97. "2: \n\t"\
  98. "movl (%%ecx, %%edx, 4), %%esi \n\t"\
  99. "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
  100. "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\
  101. "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
  102. "pmulhw %%mm0, %%mm2 \n\t"\
  103. "pmulhw %%mm0, %%mm5 \n\t"\
  104. "paddw %%mm2, %%mm3 \n\t"\
  105. "paddw %%mm5, %%mm4 \n\t"\
  106. "addl $1, %%edx \n\t"\
  107. " jnz 2b \n\t"\
  108. \
  109. "movl %0, %%edx \n\t" /* -lumFilterSize */\
  110. "movl %2, %%ebx \n\t" /* lumMmxFilter+lumFilterSize */\
  111. "movl %6, %%ecx \n\t" /* lumSrc+lumFilterSize */\
  112. "pxor %%mm1, %%mm1 \n\t"\
  113. "pxor %%mm7, %%mm7 \n\t"\
  114. "2: \n\t"\
  115. "movl (%%ecx, %%edx, 4), %%esi \n\t"\
  116. "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
  117. "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\
  118. "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
  119. "pmulhw %%mm0, %%mm2 \n\t"\
  120. "pmulhw %%mm0, %%mm5 \n\t"\
  121. "paddw %%mm2, %%mm1 \n\t"\
  122. "paddw %%mm5, %%mm7 \n\t"\
  123. "addl $1, %%edx \n\t"\
  124. " jnz 2b \n\t"\
  125. \
  126. "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
  127. "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
  128. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  129. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  130. "pmulhw ugCoeff, %%mm3 \n\t"\
  131. "pmulhw vgCoeff, %%mm4 \n\t"\
  132. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  133. "pmulhw ubCoeff, %%mm2 \n\t"\
  134. "pmulhw vrCoeff, %%mm5 \n\t"\
  135. "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
  136. "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
  137. "pmulhw yCoeff, %%mm1 \n\t"\
  138. "pmulhw yCoeff, %%mm7 \n\t"\
  139. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  140. "paddw %%mm3, %%mm4 \n\t"\
  141. "movq %%mm2, %%mm0 \n\t"\
  142. "movq %%mm5, %%mm6 \n\t"\
  143. "movq %%mm4, %%mm3 \n\t"\
  144. "punpcklwd %%mm2, %%mm2 \n\t"\
  145. "punpcklwd %%mm5, %%mm5 \n\t"\
  146. "punpcklwd %%mm4, %%mm4 \n\t"\
  147. "paddw %%mm1, %%mm2 \n\t"\
  148. "paddw %%mm1, %%mm5 \n\t"\
  149. "paddw %%mm1, %%mm4 \n\t"\
  150. "punpckhwd %%mm0, %%mm0 \n\t"\
  151. "punpckhwd %%mm6, %%mm6 \n\t"\
  152. "punpckhwd %%mm3, %%mm3 \n\t"\
  153. "paddw %%mm7, %%mm0 \n\t"\
  154. "paddw %%mm7, %%mm6 \n\t"\
  155. "paddw %%mm7, %%mm3 \n\t"\
  156. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  157. "packuswb %%mm0, %%mm2 \n\t"\
  158. "packuswb %%mm6, %%mm5 \n\t"\
  159. "packuswb %%mm3, %%mm4 \n\t"\
  160. "pxor %%mm7, %%mm7 \n\t"
  161. #define FULL_YSCALEYUV2RGB \
  162. "pxor %%mm7, %%mm7 \n\t"\
  163. "movd %6, %%mm6 \n\t" /*yalpha1*/\
  164. "punpcklwd %%mm6, %%mm6 \n\t"\
  165. "punpcklwd %%mm6, %%mm6 \n\t"\
  166. "movd %7, %%mm5 \n\t" /*uvalpha1*/\
  167. "punpcklwd %%mm5, %%mm5 \n\t"\
  168. "punpcklwd %%mm5, %%mm5 \n\t"\
  169. "xorl %%eax, %%eax \n\t"\
  170. ".balign 16 \n\t"\
  171. "1: \n\t"\
  172. "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
  173. "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
  174. "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
  175. "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
  176. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  177. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  178. "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  179. "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  180. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  181. "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  182. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  183. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  184. "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
  185. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  186. "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  187. "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
  188. "psubw w400, %%mm3 \n\t" /* 8(U-128)*/\
  189. "pmulhw yCoeff, %%mm1 \n\t"\
  190. \
  191. \
  192. "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  193. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  194. "pmulhw ubCoeff, %%mm3 \n\t"\
  195. "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  196. "pmulhw ugCoeff, %%mm2 \n\t"\
  197. "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  198. "psubw w400, %%mm0 \n\t" /* (V-128)8*/\
  199. \
  200. \
  201. "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
  202. "pmulhw vrCoeff, %%mm0 \n\t"\
  203. "pmulhw vgCoeff, %%mm4 \n\t"\
  204. "paddw %%mm1, %%mm3 \n\t" /* B*/\
  205. "paddw %%mm1, %%mm0 \n\t" /* R*/\
  206. "packuswb %%mm3, %%mm3 \n\t"\
  207. \
  208. "packuswb %%mm0, %%mm0 \n\t"\
  209. "paddw %%mm4, %%mm2 \n\t"\
  210. "paddw %%mm2, %%mm1 \n\t" /* G*/\
  211. \
  212. "packuswb %%mm1, %%mm1 \n\t"
  213. #define YSCALEYUV2RGB \
  214. "movd %6, %%mm6 \n\t" /*yalpha1*/\
  215. "punpcklwd %%mm6, %%mm6 \n\t"\
  216. "punpcklwd %%mm6, %%mm6 \n\t"\
  217. "movq %%mm6, asm_yalpha1 \n\t"\
  218. "movd %7, %%mm5 \n\t" /*uvalpha1*/\
  219. "punpcklwd %%mm5, %%mm5 \n\t"\
  220. "punpcklwd %%mm5, %%mm5 \n\t"\
  221. "movq %%mm5, asm_uvalpha1 \n\t"\
  222. "xorl %%eax, %%eax \n\t"\
  223. ".balign 16 \n\t"\
  224. "1: \n\t"\
  225. "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
  226. "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
  227. "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  228. "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  229. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  230. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  231. "movq asm_uvalpha1, %%mm0 \n\t"\
  232. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  233. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  234. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  235. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  236. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  237. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  238. "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
  239. "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
  240. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  241. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  242. "pmulhw ugCoeff, %%mm3 \n\t"\
  243. "pmulhw vgCoeff, %%mm4 \n\t"\
  244. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  245. "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
  246. "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
  247. "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
  248. "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
  249. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  250. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  251. "pmulhw asm_yalpha1, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  252. "pmulhw asm_yalpha1, %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  253. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  254. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  255. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  256. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  257. "pmulhw ubCoeff, %%mm2 \n\t"\
  258. "pmulhw vrCoeff, %%mm5 \n\t"\
  259. "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
  260. "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
  261. "pmulhw yCoeff, %%mm1 \n\t"\
  262. "pmulhw yCoeff, %%mm7 \n\t"\
  263. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  264. "paddw %%mm3, %%mm4 \n\t"\
  265. "movq %%mm2, %%mm0 \n\t"\
  266. "movq %%mm5, %%mm6 \n\t"\
  267. "movq %%mm4, %%mm3 \n\t"\
  268. "punpcklwd %%mm2, %%mm2 \n\t"\
  269. "punpcklwd %%mm5, %%mm5 \n\t"\
  270. "punpcklwd %%mm4, %%mm4 \n\t"\
  271. "paddw %%mm1, %%mm2 \n\t"\
  272. "paddw %%mm1, %%mm5 \n\t"\
  273. "paddw %%mm1, %%mm4 \n\t"\
  274. "punpckhwd %%mm0, %%mm0 \n\t"\
  275. "punpckhwd %%mm6, %%mm6 \n\t"\
  276. "punpckhwd %%mm3, %%mm3 \n\t"\
  277. "paddw %%mm7, %%mm0 \n\t"\
  278. "paddw %%mm7, %%mm6 \n\t"\
  279. "paddw %%mm7, %%mm3 \n\t"\
  280. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  281. "packuswb %%mm0, %%mm2 \n\t"\
  282. "packuswb %%mm6, %%mm5 \n\t"\
  283. "packuswb %%mm3, %%mm4 \n\t"\
  284. "pxor %%mm7, %%mm7 \n\t"
  285. #define YSCALEYUV2RGB1 \
  286. "xorl %%eax, %%eax \n\t"\
  287. ".balign 16 \n\t"\
  288. "1: \n\t"\
  289. "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
  290. "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  291. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  292. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  293. "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
  294. "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
  295. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  296. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  297. "pmulhw ugCoeff, %%mm3 \n\t"\
  298. "pmulhw vgCoeff, %%mm4 \n\t"\
  299. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  300. "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
  301. "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
  302. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  303. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  304. "pmulhw ubCoeff, %%mm2 \n\t"\
  305. "pmulhw vrCoeff, %%mm5 \n\t"\
  306. "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
  307. "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
  308. "pmulhw yCoeff, %%mm1 \n\t"\
  309. "pmulhw yCoeff, %%mm7 \n\t"\
  310. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  311. "paddw %%mm3, %%mm4 \n\t"\
  312. "movq %%mm2, %%mm0 \n\t"\
  313. "movq %%mm5, %%mm6 \n\t"\
  314. "movq %%mm4, %%mm3 \n\t"\
  315. "punpcklwd %%mm2, %%mm2 \n\t"\
  316. "punpcklwd %%mm5, %%mm5 \n\t"\
  317. "punpcklwd %%mm4, %%mm4 \n\t"\
  318. "paddw %%mm1, %%mm2 \n\t"\
  319. "paddw %%mm1, %%mm5 \n\t"\
  320. "paddw %%mm1, %%mm4 \n\t"\
  321. "punpckhwd %%mm0, %%mm0 \n\t"\
  322. "punpckhwd %%mm6, %%mm6 \n\t"\
  323. "punpckhwd %%mm3, %%mm3 \n\t"\
  324. "paddw %%mm7, %%mm0 \n\t"\
  325. "paddw %%mm7, %%mm6 \n\t"\
  326. "paddw %%mm7, %%mm3 \n\t"\
  327. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  328. "packuswb %%mm0, %%mm2 \n\t"\
  329. "packuswb %%mm6, %%mm5 \n\t"\
  330. "packuswb %%mm3, %%mm4 \n\t"\
  331. "pxor %%mm7, %%mm7 \n\t"
  332. // do vertical chrominance interpolation
  333. #define YSCALEYUV2RGB1b \
  334. "xorl %%eax, %%eax \n\t"\
  335. ".balign 16 \n\t"\
  336. "1: \n\t"\
  337. "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
  338. "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
  339. "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  340. "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  341. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  342. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  343. "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
  344. "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
  345. "psubw w400, %%mm3 \n\t" /* (U-128)8*/\
  346. "psubw w400, %%mm4 \n\t" /* (V-128)8*/\
  347. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  348. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  349. "pmulhw ugCoeff, %%mm3 \n\t"\
  350. "pmulhw vgCoeff, %%mm4 \n\t"\
  351. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  352. "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
  353. "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
  354. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  355. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  356. "pmulhw ubCoeff, %%mm2 \n\t"\
  357. "pmulhw vrCoeff, %%mm5 \n\t"\
  358. "psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
  359. "psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
  360. "pmulhw yCoeff, %%mm1 \n\t"\
  361. "pmulhw yCoeff, %%mm7 \n\t"\
  362. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  363. "paddw %%mm3, %%mm4 \n\t"\
  364. "movq %%mm2, %%mm0 \n\t"\
  365. "movq %%mm5, %%mm6 \n\t"\
  366. "movq %%mm4, %%mm3 \n\t"\
  367. "punpcklwd %%mm2, %%mm2 \n\t"\
  368. "punpcklwd %%mm5, %%mm5 \n\t"\
  369. "punpcklwd %%mm4, %%mm4 \n\t"\
  370. "paddw %%mm1, %%mm2 \n\t"\
  371. "paddw %%mm1, %%mm5 \n\t"\
  372. "paddw %%mm1, %%mm4 \n\t"\
  373. "punpckhwd %%mm0, %%mm0 \n\t"\
  374. "punpckhwd %%mm6, %%mm6 \n\t"\
  375. "punpckhwd %%mm3, %%mm3 \n\t"\
  376. "paddw %%mm7, %%mm0 \n\t"\
  377. "paddw %%mm7, %%mm6 \n\t"\
  378. "paddw %%mm7, %%mm3 \n\t"\
  379. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  380. "packuswb %%mm0, %%mm2 \n\t"\
  381. "packuswb %%mm6, %%mm5 \n\t"\
  382. "packuswb %%mm3, %%mm4 \n\t"\
  383. "pxor %%mm7, %%mm7 \n\t"
  384. #define WRITEBGR32 \
  385. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  386. "movq %%mm2, %%mm1 \n\t" /* B */\
  387. "movq %%mm5, %%mm6 \n\t" /* R */\
  388. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  389. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  390. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  391. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  392. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  393. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  394. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  395. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  396. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  397. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  398. \
  399. MOVNTQ(%%mm0, (%4, %%eax, 4))\
  400. MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
  401. MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
  402. MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
  403. \
  404. "addl $8, %%eax \n\t"\
  405. "cmpl %5, %%eax \n\t"\
  406. " jb 1b \n\t"
  407. #define WRITEBGR16 \
  408. "pand bF8, %%mm2 \n\t" /* B */\
  409. "pand bFC, %%mm4 \n\t" /* G */\
  410. "pand bF8, %%mm5 \n\t" /* R */\
  411. "psrlq $3, %%mm2 \n\t"\
  412. \
  413. "movq %%mm2, %%mm1 \n\t"\
  414. "movq %%mm4, %%mm3 \n\t"\
  415. \
  416. "punpcklbw %%mm7, %%mm3 \n\t"\
  417. "punpcklbw %%mm5, %%mm2 \n\t"\
  418. "punpckhbw %%mm7, %%mm4 \n\t"\
  419. "punpckhbw %%mm5, %%mm1 \n\t"\
  420. \
  421. "psllq $3, %%mm3 \n\t"\
  422. "psllq $3, %%mm4 \n\t"\
  423. \
  424. "por %%mm3, %%mm2 \n\t"\
  425. "por %%mm4, %%mm1 \n\t"\
  426. \
  427. MOVNTQ(%%mm2, (%4, %%eax, 2))\
  428. MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
  429. \
  430. "addl $8, %%eax \n\t"\
  431. "cmpl %5, %%eax \n\t"\
  432. " jb 1b \n\t"
  433. #define WRITEBGR15 \
  434. "pand bF8, %%mm2 \n\t" /* B */\
  435. "pand bF8, %%mm4 \n\t" /* G */\
  436. "pand bF8, %%mm5 \n\t" /* R */\
  437. "psrlq $3, %%mm2 \n\t"\
  438. "psrlq $1, %%mm5 \n\t"\
  439. \
  440. "movq %%mm2, %%mm1 \n\t"\
  441. "movq %%mm4, %%mm3 \n\t"\
  442. \
  443. "punpcklbw %%mm7, %%mm3 \n\t"\
  444. "punpcklbw %%mm5, %%mm2 \n\t"\
  445. "punpckhbw %%mm7, %%mm4 \n\t"\
  446. "punpckhbw %%mm5, %%mm1 \n\t"\
  447. \
  448. "psllq $2, %%mm3 \n\t"\
  449. "psllq $2, %%mm4 \n\t"\
  450. \
  451. "por %%mm3, %%mm2 \n\t"\
  452. "por %%mm4, %%mm1 \n\t"\
  453. \
  454. MOVNTQ(%%mm2, (%4, %%eax, 2))\
  455. MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
  456. \
  457. "addl $8, %%eax \n\t"\
  458. "cmpl %5, %%eax \n\t"\
  459. " jb 1b \n\t"
  460. #define WRITEBGR24OLD \
  461. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  462. "movq %%mm2, %%mm1 \n\t" /* B */\
  463. "movq %%mm5, %%mm6 \n\t" /* R */\
  464. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  465. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  466. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  467. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  468. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  469. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  470. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  471. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  472. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  473. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  474. \
  475. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  476. "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
  477. "pand bm00000111, %%mm4 \n\t" /* 00000RGB 0 */\
  478. "pand bm11111000, %%mm0 \n\t" /* 00RGB000 0.5 */\
  479. "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
  480. "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
  481. "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
  482. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  483. \
  484. "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  485. "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
  486. "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
  487. "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
  488. "pand bm00001111, %%mm2 \n\t" /* 0000RGBR 1 */\
  489. "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
  490. "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
  491. "pand bm00000111, %%mm4 \n\t" /* 00000RGB 2 */\
  492. "pand bm11111000, %%mm1 \n\t" /* 00RGB000 2.5 */\
  493. "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
  494. "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
  495. "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
  496. "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
  497. \
  498. "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
  499. "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
  500. "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
  501. "pand bm00000111, %%mm5 \n\t" /* 00000RGB 3 */\
  502. "pand bm11111000, %%mm3 \n\t" /* 00RGB000 3.5 */\
  503. "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
  504. "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
  505. "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
  506. \
  507. MOVNTQ(%%mm0, (%%ebx))\
  508. MOVNTQ(%%mm2, 8(%%ebx))\
  509. MOVNTQ(%%mm3, 16(%%ebx))\
  510. "addl $24, %%ebx \n\t"\
  511. \
  512. "addl $8, %%eax \n\t"\
  513. "cmpl %5, %%eax \n\t"\
  514. " jb 1b \n\t"
  515. #define WRITEBGR24MMX \
  516. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  517. "movq %%mm2, %%mm1 \n\t" /* B */\
  518. "movq %%mm5, %%mm6 \n\t" /* R */\
  519. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  520. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  521. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  522. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  523. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  524. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  525. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  526. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  527. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  528. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  529. \
  530. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  531. "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
  532. "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
  533. "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
  534. \
  535. "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
  536. "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
  537. "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
  538. "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
  539. \
  540. "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
  541. "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
  542. "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
  543. "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
  544. \
  545. "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
  546. "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
  547. "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
  548. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  549. MOVNTQ(%%mm0, (%%ebx))\
  550. \
  551. "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
  552. "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
  553. "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
  554. "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
  555. MOVNTQ(%%mm6, 8(%%ebx))\
  556. \
  557. "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
  558. "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
  559. "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
  560. MOVNTQ(%%mm5, 16(%%ebx))\
  561. \
  562. "addl $24, %%ebx \n\t"\
  563. \
  564. "addl $8, %%eax \n\t"\
  565. "cmpl %5, %%eax \n\t"\
  566. " jb 1b \n\t"
  567. #define WRITEBGR24MMX2 \
  568. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  569. "movq M24A, %%mm0 \n\t"\
  570. "movq M24C, %%mm7 \n\t"\
  571. "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
  572. "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
  573. "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
  574. \
  575. "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
  576. "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
  577. "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
  578. \
  579. "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
  580. "por %%mm1, %%mm6 \n\t"\
  581. "por %%mm3, %%mm6 \n\t"\
  582. MOVNTQ(%%mm6, (%%ebx))\
  583. \
  584. "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
  585. "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
  586. "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
  587. "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
  588. \
  589. "pand M24B, %%mm1 \n\t" /* B5 B4 B3 */\
  590. "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
  591. "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
  592. \
  593. "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
  594. "por %%mm3, %%mm6 \n\t"\
  595. MOVNTQ(%%mm6, 8(%%ebx))\
  596. \
  597. "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
  598. "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
  599. "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
  600. \
  601. "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
  602. "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
  603. "pand M24B, %%mm6 \n\t" /* R7 R6 R5 */\
  604. \
  605. "por %%mm1, %%mm3 \n\t"\
  606. "por %%mm3, %%mm6 \n\t"\
  607. MOVNTQ(%%mm6, 16(%%ebx))\
  608. \
  609. "addl $24, %%ebx \n\t"\
  610. \
  611. "addl $8, %%eax \n\t"\
  612. "cmpl %5, %%eax \n\t"\
  613. " jb 1b \n\t"
  614. #ifdef HAVE_MMX2
  615. #undef WRITEBGR24
  616. #define WRITEBGR24 WRITEBGR24MMX2
  617. #else
  618. #undef WRITEBGR24
  619. #define WRITEBGR24 WRITEBGR24MMX
  620. #endif
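/* For reference, a plain-C sketch of what the vertical filter above computes
 * (an illustrative sketch only; the real non-MMX path is yuv2yuvXinC, whose
 * exact implementation lives elsewhere in swscale): */
#if 0
static inline void yuv2yuvX_sketch(int16_t *filter, int16_t **src, int filterSize,
                                   uint8_t *dest, int dstW)
{
	int i, j;
	for(i=0; i<dstW; i++)
	{
		int val=0;
		for(j=0; j<filterSize; j++)
			val += src[j][i] * filter[j];	// accumulate the filter taps
		val >>= 19;				// back to 8 bit range (pmulhw>>16 + psraw $3)
		dest[i]= MIN(MAX(val, 0), 255);		// clip like packuswb does
	}
}
#endif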
  621. static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
  622. int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
  623. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW,
  624. int16_t * lumMmxFilter, int16_t * chrMmxFilter)
  625. {
  626. #ifdef HAVE_MMX
  627. if(uDest != NULL)
  628. {
  629. asm volatile(
  630. YSCALEYUV2YV12X(0)
  631. :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
  632. "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1)
  633. : "%eax", "%edx", "%esi"
  634. );
  635. asm volatile(
  636. YSCALEYUV2YV12X(4096)
  637. :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
  638. "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1)
  639. : "%eax", "%edx", "%esi"
  640. );
  641. }
  642. asm volatile(
  643. YSCALEYUV2YV12X(0)
  644. :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
  645. "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
  646. : "%eax", "%edx", "%esi"
  647. );
  648. #else
  649. yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
  650. chrFilter, chrSrc, chrFilterSize,
  651. dest, uDest, vDest, dstW);
  652. #endif
  653. }
  654. static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
  655. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
  656. {
  657. #ifdef HAVE_MMX
  658. if(uDest != NULL)
  659. {
  660. asm volatile(
  661. YSCALEYUV2YV121
  662. :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)),
  663. "g" (-(dstW>>1))
  664. : "%eax"
  665. );
  666. asm volatile(
  667. YSCALEYUV2YV121
  668. :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)),
  669. "g" (-(dstW>>1))
  670. : "%eax"
  671. );
  672. }
  673. asm volatile(
  674. YSCALEYUV2YV121
  675. :: "r" (lumSrc + dstW), "r" (dest + dstW),
  676. "g" (-dstW)
  677. : "%eax"
  678. );
  679. #else
  680. //FIXME Optimize (just quickly written, not optimized)
  681. //FIXME replace MINMAX with LUTs
  682. int i;
  683. for(i=0; i<dstW; i++)
  684. {
  685. int val= lumSrc[i]>>7;
  686. dest[i]= MIN(MAX(val, 0), 255);
  687. }
  688. if(uDest != NULL)
  689. for(i=0; i<(dstW>>1); i++)
  690. {
  691. int u=chrSrc[i]>>7;
  692. int v=chrSrc[i + 2048]>>7;
  693. uDest[i]= MIN(MAX(u, 0), 255);
  694. vDest[i]= MIN(MAX(v, 0), 255);
  695. }
  696. #endif
  697. }
  698. /**
  699. * vertical scale YV12 to RGB
  700. */
  701. static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
  702. int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
  703. uint8_t *dest, int dstW, int dstbpp, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
  704. {
  705. if(fullUVIpol)
  706. {
  707. //FIXME
  708. }//FULL_UV_IPOL
  709. else
  710. {
  711. #ifdef HAVE_MMX
  712. if(dstbpp == 32) //FIXME untested
  713. {
  714. asm volatile(
  715. YSCALEYUV2RGBX
  716. WRITEBGR32
  717. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  718. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  719. "r" (dest), "m" (dstW),
  720. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  721. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  722. );
  723. }
  724. else if(dstbpp==24) //FIXME untested
  725. {
  726. asm volatile(
  727. YSCALEYUV2RGBX
  728. "leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize
  729. "addl %4, %%ebx \n\t"
  730. WRITEBGR24
  731. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  732. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  733. "r" (dest), "m" (dstW),
  734. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  735. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  736. );
  737. }
  738. else if(dstbpp==15)
  739. {
  740. asm volatile(
  741. YSCALEYUV2RGBX
  742. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  743. #ifdef DITHER1XBPP
  744. "paddusb b5Dither, %%mm2 \n\t"
  745. "paddusb g5Dither, %%mm4 \n\t"
  746. "paddusb r5Dither, %%mm5 \n\t"
  747. #endif
  748. WRITEBGR15
  749. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  750. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  751. "r" (dest), "m" (dstW),
  752. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  753. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  754. );
  755. }
  756. else if(dstbpp==16)
  757. {
  758. asm volatile(
  759. YSCALEYUV2RGBX
  760. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  761. #ifdef DITHER1XBPP
  762. "paddusb b5Dither, %%mm2 \n\t"
  763. "paddusb g6Dither, %%mm4 \n\t"
  764. "paddusb r5Dither, %%mm5 \n\t"
  765. #endif
  766. WRITEBGR16
  767. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  768. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  769. "r" (dest), "m" (dstW),
  770. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  771. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  772. );
  773. }
  774. #else
  775. yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
  776. chrFilter, chrSrc, chrFilterSize,
  777. dest, dstW, dstbpp);
  778. #endif
  779. } //!FULL_UV_IPOL
  780. }
  781. /**
  782. * vertical bilinear scale YV12 to RGB
  783. */
  784. static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
  785. uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstbpp)
  786. {
  787. int yalpha1=yalpha^4095;
  788. int uvalpha1=uvalpha^4095;
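// yalpha/uvalpha run from 0 to 4095, so x^4095 == 4095-x; conceptually the
// blend below is out = (buf0[i]*yalpha1 + buf1[i]*yalpha) >> 19 (likewise for
// chroma), bringing the 15 bit intermediates back to 8 bit, exactly as in the
// plain-C fallback further down. (Note added for clarity.)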
  789. if(fullUVIpol)
  790. {
  791. #ifdef HAVE_MMX
  792. if(dstbpp == 32)
  793. {
  794. asm volatile(
  795. FULL_YSCALEYUV2RGB
  796. "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
  797. "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
  798. "movq %%mm3, %%mm1 \n\t"
  799. "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
  800. "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
  801. MOVNTQ(%%mm3, (%4, %%eax, 4))
  802. MOVNTQ(%%mm1, 8(%4, %%eax, 4))
  803. "addl $4, %%eax \n\t"
  804. "cmpl %5, %%eax \n\t"
  805. " jb 1b \n\t"
  806. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  807. "m" (yalpha1), "m" (uvalpha1)
  808. : "%eax"
  809. );
  810. }
  811. else if(dstbpp==24)
  812. {
  813. asm volatile(
  814. FULL_YSCALEYUV2RGB
  815. // lsb ... msb
  816. "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
  817. "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
  818. "movq %%mm3, %%mm1 \n\t"
  819. "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
  820. "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
  821. "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
  822. "psrlq $8, %%mm3 \n\t" // GR0BGR00
  823. "pand bm00000111, %%mm2 \n\t" // BGR00000
  824. "pand bm11111000, %%mm3 \n\t" // 000BGR00
  825. "por %%mm2, %%mm3 \n\t" // BGRBGR00
  826. "movq %%mm1, %%mm2 \n\t"
  827. "psllq $48, %%mm1 \n\t" // 000000BG
  828. "por %%mm1, %%mm3 \n\t" // BGRBGRBG
  829. "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
  830. "psrld $16, %%mm2 \n\t" // R000R000
  831. "psrlq $24, %%mm1 \n\t" // 0BGR0000
  832. "por %%mm2, %%mm1 \n\t" // RBGRR000
  833. "movl %4, %%ebx \n\t"
  834. "addl %%eax, %%ebx \n\t"
  835. #ifdef HAVE_MMX2
  836. //FIXME Alignment
  837. "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
  838. "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
  839. #else
  840. "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
  841. "psrlq $32, %%mm3 \n\t"
  842. "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
  843. "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
  844. #endif
  845. "addl $4, %%eax \n\t"
  846. "cmpl %5, %%eax \n\t"
  847. " jb 1b \n\t"
  848. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
  849. "m" (yalpha1), "m" (uvalpha1)
  850. : "%eax", "%ebx"
  851. );
  852. }
  853. else if(dstbpp==15)
  854. {
  855. asm volatile(
  856. FULL_YSCALEYUV2RGB
  857. #ifdef DITHER1XBPP
  858. "paddusb g5Dither, %%mm1 \n\t"
  859. "paddusb r5Dither, %%mm0 \n\t"
  860. "paddusb b5Dither, %%mm3 \n\t"
  861. #endif
  862. "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
  863. "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
  864. "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
  865. "psrlw $3, %%mm3 \n\t"
  866. "psllw $2, %%mm1 \n\t"
  867. "psllw $7, %%mm0 \n\t"
  868. "pand g15Mask, %%mm1 \n\t"
  869. "pand r15Mask, %%mm0 \n\t"
  870. "por %%mm3, %%mm1 \n\t"
  871. "por %%mm1, %%mm0 \n\t"
  872. MOVNTQ(%%mm0, (%4, %%eax, 2))
  873. "addl $4, %%eax \n\t"
  874. "cmpl %5, %%eax \n\t"
  875. " jb 1b \n\t"
  876. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  877. "m" (yalpha1), "m" (uvalpha1)
  878. : "%eax"
  879. );
  880. }
  881. else if(dstbpp==16)
  882. {
  883. asm volatile(
  884. FULL_YSCALEYUV2RGB
  885. #ifdef DITHER1XBPP
  886. "paddusb g6Dither, %%mm1 \n\t"
  887. "paddusb r5Dither, %%mm0 \n\t"
  888. "paddusb b5Dither, %%mm3 \n\t"
  889. #endif
  890. "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
  891. "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
  892. "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
  893. "psrlw $3, %%mm3 \n\t"
  894. "psllw $3, %%mm1 \n\t"
  895. "psllw $8, %%mm0 \n\t"
  896. "pand g16Mask, %%mm1 \n\t"
  897. "pand r16Mask, %%mm0 \n\t"
  898. "por %%mm3, %%mm1 \n\t"
  899. "por %%mm1, %%mm0 \n\t"
  900. MOVNTQ(%%mm0, (%4, %%eax, 2))
  901. "addl $4, %%eax \n\t"
  902. "cmpl %5, %%eax \n\t"
  903. " jb 1b \n\t"
  904. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  905. "m" (yalpha1), "m" (uvalpha1)
  906. : "%eax"
  907. );
  908. }
  909. #else
  910. if(dstbpp==32 || dstbpp==24)
  911. {
  912. int i;
  913. for(i=0;i<dstW;i++){
  914. // vertical linear interpolation && yuv2rgb in a single step:
  915. int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  916. int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
  917. int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
  918. dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
  919. dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
  920. dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
  921. dest+=dstbpp>>3;
  922. }
  923. }
  924. else if(dstbpp==16)
  925. {
  926. int i;
  927. for(i=0;i<dstW;i++){
  928. // vertical linear interpolation && yuv2rgb in a single step:
  929. int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  930. int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
  931. int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
  932. ((uint16_t*)dest)[i] =
  933. clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
  934. clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
  935. clip_table16r[(Y + yuvtab_3343[V]) >>13];
  936. }
  937. }
  938. else if(dstbpp==15)
  939. {
  940. int i;
  941. for(i=0;i<dstW;i++){
  942. // vertical linear interpolation && yuv2rgb in a single step:
  943. int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  944. int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
  945. int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
  946. ((uint16_t*)dest)[i] =
  947. clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
  948. clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
  949. clip_table15r[(Y + yuvtab_3343[V]) >>13];
  950. }
  951. }
  952. #endif
  953. }//FULL_UV_IPOL
  954. else
  955. {
  956. #ifdef HAVE_MMX
  957. if(dstbpp == 32)
  958. {
  959. asm volatile(
  960. YSCALEYUV2RGB
  961. WRITEBGR32
  962. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  963. "m" (yalpha1), "m" (uvalpha1)
  964. : "%eax"
  965. );
  966. }
  967. else if(dstbpp==24)
  968. {
  969. asm volatile(
  970. "movl %4, %%ebx \n\t"
  971. YSCALEYUV2RGB
  972. WRITEBGR24
  973. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
  974. "m" (yalpha1), "m" (uvalpha1)
  975. : "%eax", "%ebx"
  976. );
  977. }
  978. else if(dstbpp==15)
  979. {
  980. asm volatile(
  981. YSCALEYUV2RGB
  982. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  983. #ifdef DITHER1XBPP
  984. "paddusb b5Dither, %%mm2 \n\t"
  985. "paddusb g5Dither, %%mm4 \n\t"
  986. "paddusb r5Dither, %%mm5 \n\t"
  987. #endif
  988. WRITEBGR15
  989. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  990. "m" (yalpha1), "m" (uvalpha1)
  991. : "%eax"
  992. );
  993. }
  994. else if(dstbpp==16)
  995. {
  996. asm volatile(
  997. YSCALEYUV2RGB
  998. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  999. #ifdef DITHER1XBPP
  1000. "paddusb b5Dither, %%mm2 \n\t"
  1001. "paddusb g6Dither, %%mm4 \n\t"
  1002. "paddusb r5Dither, %%mm5 \n\t"
  1003. #endif
  1004. WRITEBGR16
  1005. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  1006. "m" (yalpha1), "m" (uvalpha1)
  1007. : "%eax"
  1008. );
  1009. }
  1010. #else
  1011. if(dstbpp==32)
  1012. {
  1013. int i;
  1014. for(i=0; i<dstW-1; i+=2){
  1015. // vertical linear interpolation && yuv2rgb in a single step:
  1016. int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  1017. int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
  1018. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1019. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1020. int Cb= yuvtab_40cf[U];
  1021. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1022. int Cr= yuvtab_3343[V];
  1023. dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
  1024. dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
  1025. dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
  1026. dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
  1027. dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
  1028. dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
  1029. }
  1030. }
  1031. else if(dstbpp==24)
  1032. {
  1033. int i;
  1034. for(i=0; i<dstW-1; i+=2){
  1035. // vertical linear interpolation && yuv2rgb in a single step:
  1036. int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  1037. int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
  1038. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1039. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1040. int Cb= yuvtab_40cf[U];
  1041. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1042. int Cr= yuvtab_3343[V];
  1043. dest[0]=clip_table[((Y1 + Cb) >>13)];
  1044. dest[1]=clip_table[((Y1 + Cg) >>13)];
  1045. dest[2]=clip_table[((Y1 + Cr) >>13)];
  1046. dest[3]=clip_table[((Y2 + Cb) >>13)];
  1047. dest[4]=clip_table[((Y2 + Cg) >>13)];
  1048. dest[5]=clip_table[((Y2 + Cr) >>13)];
  1049. dest+=6;
  1050. }
  1051. }
  1052. else if(dstbpp==16)
  1053. {
  1054. int i;
  1055. for(i=0; i<dstW-1; i+=2){
  1056. // vertical linear interpolation && yuv2rgb in a single step:
  1057. int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  1058. int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
  1059. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1060. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1061. int Cb= yuvtab_40cf[U];
  1062. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1063. int Cr= yuvtab_3343[V];
  1064. ((uint16_t*)dest)[i] =
  1065. clip_table16b[(Y1 + Cb) >>13] |
  1066. clip_table16g[(Y1 + Cg) >>13] |
  1067. clip_table16r[(Y1 + Cr) >>13];
  1068. ((uint16_t*)dest)[i+1] =
  1069. clip_table16b[(Y2 + Cb) >>13] |
  1070. clip_table16g[(Y2 + Cg) >>13] |
  1071. clip_table16r[(Y2 + Cr) >>13];
  1072. }
  1073. }
  1074. else if(dstbpp==15)
  1075. {
  1076. int i;
  1077. for(i=0; i<dstW-1; i+=2){
  1078. // vertical linear interpolation && yuv2rgb in a single step:
  1079. int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  1080. int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
  1081. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1082. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1083. int Cb= yuvtab_40cf[U];
  1084. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1085. int Cr= yuvtab_3343[V];
  1086. ((uint16_t*)dest)[i] =
  1087. clip_table15b[(Y1 + Cb) >>13] |
  1088. clip_table15g[(Y1 + Cg) >>13] |
  1089. clip_table15r[(Y1 + Cr) >>13];
  1090. ((uint16_t*)dest)[i+1] =
  1091. clip_table15b[(Y2 + Cb) >>13] |
  1092. clip_table15g[(Y2 + Cg) >>13] |
  1093. clip_table15r[(Y2 + Cr) >>13];
  1094. }
  1095. }
  1096. #endif
  1097. } //!FULL_UV_IPOL
  1098. }
  1099. /**
  1100. * YV12 to RGB without scaling or interpolating
  1101. */
  1102. static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
  1103. uint8_t *dest, int dstW, int uvalpha, int dstbpp)
  1104. {
  1105. int uvalpha1=uvalpha^4095;
  1106. const int yalpha1=0;
  1107. if(fullUVIpol || allwaysIpol)
  1108. {
  1109. RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstbpp);
  1110. return;
  1111. }
  1112. #ifdef HAVE_MMX
  1113. if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
  1114. {
  1115. if(dstbpp == 32)
  1116. {
  1117. asm volatile(
  1118. YSCALEYUV2RGB1
  1119. WRITEBGR32
  1120. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  1121. "m" (yalpha1), "m" (uvalpha1)
  1122. : "%eax"
  1123. );
  1124. }
  1125. else if(dstbpp==24)
  1126. {
  1127. asm volatile(
  1128. "movl %4, %%ebx \n\t"
  1129. YSCALEYUV2RGB1
  1130. WRITEBGR24
  1131. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
  1132. "m" (yalpha1), "m" (uvalpha1)
  1133. : "%eax", "%ebx"
  1134. );
  1135. }
  1136. else if(dstbpp==15)
  1137. {
  1138. asm volatile(
  1139. YSCALEYUV2RGB1
  1140. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1141. #ifdef DITHER1XBPP
  1142. "paddusb b5Dither, %%mm2 \n\t"
  1143. "paddusb g5Dither, %%mm4 \n\t"
  1144. "paddusb r5Dither, %%mm5 \n\t"
  1145. #endif
  1146. WRITEBGR15
  1147. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  1148. "m" (yalpha1), "m" (uvalpha1)
  1149. : "%eax"
  1150. );
  1151. }
  1152. else if(dstbpp==16)
  1153. {
  1154. asm volatile(
  1155. YSCALEYUV2RGB1
  1156. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1157. #ifdef DITHER1XBPP
  1158. "paddusb b5Dither, %%mm2 \n\t"
  1159. "paddusb g6Dither, %%mm4 \n\t"
  1160. "paddusb r5Dither, %%mm5 \n\t"
  1161. #endif
  1162. WRITEBGR16
  1163. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  1164. "m" (yalpha1), "m" (uvalpha1)
  1165. : "%eax"
  1166. );
  1167. }
  1168. }
  1169. else
  1170. {
  1171. if(dstbpp == 32)
  1172. {
  1173. asm volatile(
  1174. YSCALEYUV2RGB1b
  1175. WRITEBGR32
  1176. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  1177. "m" (yalpha1), "m" (uvalpha1)
  1178. : "%eax"
  1179. );
  1180. }
  1181. else if(dstbpp==24)
  1182. {
  1183. asm volatile(
  1184. "movl %4, %%ebx \n\t"
  1185. YSCALEYUV2RGB1b
  1186. WRITEBGR24
  1187. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
  1188. "m" (yalpha1), "m" (uvalpha1)
  1189. : "%eax", "%ebx"
  1190. );
  1191. }
  1192. else if(dstbpp==15)
  1193. {
  1194. asm volatile(
  1195. YSCALEYUV2RGB1b
  1196. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1197. #ifdef DITHER1XBPP
  1198. "paddusb b5Dither, %%mm2 \n\t"
  1199. "paddusb g5Dither, %%mm4 \n\t"
  1200. "paddusb r5Dither, %%mm5 \n\t"
  1201. #endif
  1202. WRITEBGR15
  1203. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  1204. "m" (yalpha1), "m" (uvalpha1)
  1205. : "%eax"
  1206. );
  1207. }
  1208. else if(dstbpp==16)
  1209. {
  1210. asm volatile(
  1211. YSCALEYUV2RGB1b
  1212. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1213. #ifdef DITHER1XBPP
  1214. "paddusb b5Dither, %%mm2 \n\t"
  1215. "paddusb g6Dither, %%mm4 \n\t"
  1216. "paddusb r5Dither, %%mm5 \n\t"
  1217. #endif
  1218. WRITEBGR16
  1219. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  1220. "m" (yalpha1), "m" (uvalpha1)
  1221. : "%eax"
  1222. );
  1223. }
  1224. }
  1225. #else
  1226. //FIXME write 2 versions (for even & odd lines)
  1227. if(dstbpp==32)
  1228. {
  1229. int i;
  1230. for(i=0; i<dstW-1; i+=2){
  1231. // vertical linear interpolation && yuv2rgb in a single step:
  1232. int Y1=yuvtab_2568[buf0[i]>>7];
  1233. int Y2=yuvtab_2568[buf0[i+1]>>7];
  1234. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1235. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1236. int Cb= yuvtab_40cf[U];
  1237. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1238. int Cr= yuvtab_3343[V];
  1239. dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
  1240. dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
  1241. dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
  1242. dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
  1243. dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
  1244. dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
  1245. }
  1246. }
  1247. else if(dstbpp==24)
  1248. {
  1249. int i;
  1250. for(i=0; i<dstW-1; i+=2){
  1251. // vertical linear interpolation && yuv2rgb in a single step:
  1252. int Y1=yuvtab_2568[buf0[i]>>7];
  1253. int Y2=yuvtab_2568[buf0[i+1]>>7];
  1254. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1255. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1256. int Cb= yuvtab_40cf[U];
  1257. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1258. int Cr= yuvtab_3343[V];
  1259. dest[0]=clip_table[((Y1 + Cb) >>13)];
  1260. dest[1]=clip_table[((Y1 + Cg) >>13)];
  1261. dest[2]=clip_table[((Y1 + Cr) >>13)];
  1262. dest[3]=clip_table[((Y2 + Cb) >>13)];
  1263. dest[4]=clip_table[((Y2 + Cg) >>13)];
  1264. dest[5]=clip_table[((Y2 + Cr) >>13)];
  1265. dest+=6;
  1266. }
  1267. }
  1268. else if(dstbpp==16)
  1269. {
  1270. int i;
  1271. for(i=0; i<dstW-1; i+=2){
  1272. // vertical linear interpolation && yuv2rgb in a single step:
  1273. int Y1=yuvtab_2568[buf0[i]>>7];
  1274. int Y2=yuvtab_2568[buf0[i+1]>>7];
  1275. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1276. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1277. int Cb= yuvtab_40cf[U];
  1278. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1279. int Cr= yuvtab_3343[V];
  1280. ((uint16_t*)dest)[i] =
  1281. clip_table16b[(Y1 + Cb) >>13] |
  1282. clip_table16g[(Y1 + Cg) >>13] |
  1283. clip_table16r[(Y1 + Cr) >>13];
  1284. ((uint16_t*)dest)[i+1] =
  1285. clip_table16b[(Y2 + Cb) >>13] |
  1286. clip_table16g[(Y2 + Cg) >>13] |
  1287. clip_table16r[(Y2 + Cr) >>13];
  1288. }
  1289. }
  1290. else if(dstbpp==15)
  1291. {
  1292. int i;
  1293. for(i=0; i<dstW-1; i+=2){
  1294. // vertical linear interpolation && yuv2rgb in a single step:
  1295. int Y1=yuvtab_2568[buf0[i]>>7];
  1296. int Y2=yuvtab_2568[buf0[i+1]>>7];
  1297. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1298. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1299. int Cb= yuvtab_40cf[U];
  1300. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1301. int Cr= yuvtab_3343[V];
  1302. ((uint16_t*)dest)[i] =
  1303. clip_table15b[(Y1 + Cb) >>13] |
  1304. clip_table15g[(Y1 + Cg) >>13] |
  1305. clip_table15r[(Y1 + Cr) >>13];
  1306. ((uint16_t*)dest)[i+1] =
  1307. clip_table15b[(Y2 + Cb) >>13] |
  1308. clip_table15g[(Y2 + Cg) >>13] |
  1309. clip_table15r[(Y2 + Cr) >>13];
  1310. }
  1311. }
  1312. #endif
  1313. }
  1314. // Bilinear / Bicubic scaling
  1315. static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
  1316. int16_t *filter, int16_t *filterPos, int filterSize)
  1317. {
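// Each output sample is essentially
//   dst[i] = clip15( (sum_j src[filterPos[i]+j] * filter[filterSize*i+j]) >> 7 )
// as in the plain-C fallback at the end of this function; the MMX paths below
// special-case filterSize==4 and ==8 and handle the general size in the last
// branch. (Summary comment added.)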
  1318. #ifdef HAVE_MMX
  1319. if(filterSize==4) // always true for upscaling, sometimes for downscaling too
  1320. {
  1321. int counter= -2*dstW;
  1322. filter-= counter*2;
  1323. filterPos-= counter/2;
  1324. dst-= counter/2;
  1325. asm volatile(
  1326. "pxor %%mm7, %%mm7 \n\t"
  1327. "movq w02, %%mm6 \n\t"
  1328. "pushl %%ebp \n\t" // we use 7 regs here ...
  1329. "movl %%eax, %%ebp \n\t"
  1330. ".balign 16 \n\t"
  1331. "1: \n\t"
  1332. "movzwl (%2, %%ebp), %%eax \n\t"
  1333. "movzwl 2(%2, %%ebp), %%ebx \n\t"
  1334. "movq (%1, %%ebp, 4), %%mm1 \n\t"
  1335. "movq 8(%1, %%ebp, 4), %%mm3 \n\t"
  1336. "movd (%3, %%eax), %%mm0 \n\t"
  1337. "movd (%3, %%ebx), %%mm2 \n\t"
  1338. "punpcklbw %%mm7, %%mm0 \n\t"
  1339. "punpcklbw %%mm7, %%mm2 \n\t"
  1340. "pmaddwd %%mm1, %%mm0 \n\t"
  1341. "pmaddwd %%mm2, %%mm3 \n\t"
  1342. "psrad $8, %%mm0 \n\t"
  1343. "psrad $8, %%mm3 \n\t"
  1344. "packssdw %%mm3, %%mm0 \n\t"
  1345. "pmaddwd %%mm6, %%mm0 \n\t"
  1346. "packssdw %%mm0, %%mm0 \n\t"
  1347. "movd %%mm0, (%4, %%ebp) \n\t"
  1348. "addl $4, %%ebp \n\t"
  1349. " jnc 1b \n\t"
  1350. "popl %%ebp \n\t"
  1351. : "+a" (counter)
  1352. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  1353. : "%ebx"
  1354. );
  1355. }
  1356. else if(filterSize==8)
  1357. {
  1358. int counter= -2*dstW;
  1359. filter-= counter*4;
  1360. filterPos-= counter/2;
  1361. dst-= counter/2;
  1362. asm volatile(
  1363. "pxor %%mm7, %%mm7 \n\t"
  1364. "movq w02, %%mm6 \n\t"
  1365. "pushl %%ebp \n\t" // we use 7 regs here ...
  1366. "movl %%eax, %%ebp \n\t"
  1367. ".balign 16 \n\t"
  1368. "1: \n\t"
  1369. "movzwl (%2, %%ebp), %%eax \n\t"
  1370. "movzwl 2(%2, %%ebp), %%ebx \n\t"
  1371. "movq (%1, %%ebp, 8), %%mm1 \n\t"
  1372. "movq 16(%1, %%ebp, 8), %%mm3 \n\t"
  1373. "movd (%3, %%eax), %%mm0 \n\t"
  1374. "movd (%3, %%ebx), %%mm2 \n\t"
  1375. "punpcklbw %%mm7, %%mm0 \n\t"
  1376. "punpcklbw %%mm7, %%mm2 \n\t"
  1377. "pmaddwd %%mm1, %%mm0 \n\t"
  1378. "pmaddwd %%mm2, %%mm3 \n\t"
  1379. "movq 8(%1, %%ebp, 8), %%mm1 \n\t"
  1380. "movq 24(%1, %%ebp, 8), %%mm5 \n\t"
  1381. "movd 4(%3, %%eax), %%mm4 \n\t"
  1382. "movd 4(%3, %%ebx), %%mm2 \n\t"
  1383. "punpcklbw %%mm7, %%mm4 \n\t"
  1384. "punpcklbw %%mm7, %%mm2 \n\t"
  1385. "pmaddwd %%mm1, %%mm4 \n\t"
  1386. "pmaddwd %%mm2, %%mm5 \n\t"
  1387. "paddd %%mm4, %%mm0 \n\t"
  1388. "paddd %%mm5, %%mm3 \n\t"
  1389. "psrad $8, %%mm0 \n\t"
  1390. "psrad $8, %%mm3 \n\t"
  1391. "packssdw %%mm3, %%mm0 \n\t"
  1392. "pmaddwd %%mm6, %%mm0 \n\t"
  1393. "packssdw %%mm0, %%mm0 \n\t"
  1394. "movd %%mm0, (%4, %%ebp) \n\t"
  1395. "addl $4, %%ebp \n\t"
  1396. " jnc 1b \n\t"
  1397. "popl %%ebp \n\t"
  1398. : "+a" (counter)
  1399. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  1400. : "%ebx"
  1401. );
  1402. }
  1403. else
  1404. {
  1405. int counter= -2*dstW;
  1406. // filter-= counter*filterSize/2;
  1407. filterPos-= counter/2;
  1408. dst-= counter/2;
  1409. asm volatile(
  1410. "pxor %%mm7, %%mm7 \n\t"
  1411. "movq w02, %%mm6 \n\t"
  1412. ".balign 16 \n\t"
  1413. "1: \n\t"
  1414. "movl %2, %%ecx \n\t"
  1415. "movzwl (%%ecx, %0), %%eax \n\t"
  1416. "movzwl 2(%%ecx, %0), %%ebx \n\t"
  1417. "movl %5, %%ecx \n\t"
  1418. "pxor %%mm4, %%mm4 \n\t"
  1419. "pxor %%mm5, %%mm5 \n\t"
  1420. "2: \n\t"
  1421. "movq (%1), %%mm1 \n\t"
  1422. "movq (%1, %6), %%mm3 \n\t"
  1423. "movd (%%ecx, %%eax), %%mm0 \n\t"
  1424. "movd (%%ecx, %%ebx), %%mm2 \n\t"
  1425. "punpcklbw %%mm7, %%mm0 \n\t"
  1426. "punpcklbw %%mm7, %%mm2 \n\t"
  1427. "pmaddwd %%mm1, %%mm0 \n\t"
  1428. "pmaddwd %%mm2, %%mm3 \n\t"
  1429. "paddd %%mm3, %%mm5 \n\t"
  1430. "paddd %%mm0, %%mm4 \n\t"
  1431. "addl $8, %1 \n\t"
  1432. "addl $4, %%ecx \n\t"
  1433. "cmpl %4, %%ecx \n\t"
  1434. " jb 2b \n\t"
  1435. "addl %6, %1 \n\t"
  1436. "psrad $8, %%mm4 \n\t"
  1437. "psrad $8, %%mm5 \n\t"
  1438. "packssdw %%mm5, %%mm4 \n\t"
  1439. "pmaddwd %%mm6, %%mm4 \n\t"
  1440. "packssdw %%mm4, %%mm4 \n\t"
  1441. "movl %3, %%eax \n\t"
  1442. "movd %%mm4, (%%eax, %0) \n\t"
  1443. "addl $4, %0 \n\t"
  1444. " jnc 1b \n\t"
  1445. : "+r" (counter), "+r" (filter)
  1446. : "m" (filterPos), "m" (dst), "m"(src+filterSize),
  1447. "m" (src), "r" (filterSize*2)
  1448. : "%ebx", "%eax", "%ecx"
  1449. );
  1450. }
  1451. #else
  1452. int i;
  1453. for(i=0; i<dstW; i++)
  1454. {
  1455. int j;
  1456. int srcPos= filterPos[i];
  1457. int val=0;
  1458. // printf("filterPos: %d\n", filterPos[i]);
  1459. for(j=0; j<filterSize; j++)
  1460. {
  1461. // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
  1462. val += ((int)src[srcPos + j])*filter[filterSize*i + j];
  1463. }
  1464. // filter += hFilterSize;
  1465. dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
  1466. // dst[i] = val>>7;
  1467. }
  1468. #endif
  1469. }
  1470. // *** horizontal scale Y line to temp buffer
  1471. static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc)
  1472. {
  1473. #ifdef HAVE_MMX
  1474. // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
  1475. if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
  1476. #else
  1477. if(sws_flags != SWS_FAST_BILINEAR)
  1478. #endif
  1479. {
  1480. RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
  1481. }
  1482. else // Fast Bilinear upscale / crap downscale
  1483. {
  1484. #ifdef ARCH_X86
  1485. #ifdef HAVE_MMX2
  1486. int i;
  1487. if(canMMX2BeUsed)
  1488. {
  1489. asm volatile(
  1490. "pxor %%mm7, %%mm7 \n\t"
  1491. "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
  1492. "movd %5, %%mm6 \n\t" // xInc&0xFFFF
  1493. "punpcklwd %%mm6, %%mm6 \n\t"
  1494. "punpcklwd %%mm6, %%mm6 \n\t"
  1495. "movq %%mm6, %%mm2 \n\t"
  1496. "psllq $16, %%mm2 \n\t"
  1497. "paddw %%mm6, %%mm2 \n\t"
  1498. "psllq $16, %%mm2 \n\t"
  1499. "paddw %%mm6, %%mm2 \n\t"
  1500. "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF
  1501. "movq %%mm2, temp0 \n\t"
  1502. "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
  1503. "punpcklwd %%mm6, %%mm6 \n\t"
  1504. "punpcklwd %%mm6, %%mm6 \n\t"
  1505. "xorl %%eax, %%eax \n\t" // i
  1506. "movl %0, %%esi \n\t" // src
  1507. "movl %1, %%edi \n\t" // buf1
  1508. "movl %3, %%edx \n\t" // (xInc*4)>>16
  1509. "xorl %%ecx, %%ecx \n\t"
  1510. "xorl %%ebx, %%ebx \n\t"
  1511. "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
  1512. #define FUNNY_Y_CODE \
  1513. PREFETCH" 1024(%%esi) \n\t"\
  1514. PREFETCH" 1056(%%esi) \n\t"\
  1515. PREFETCH" 1088(%%esi) \n\t"\
  1516. "call funnyYCode \n\t"\
  1517. "movq temp0, %%mm2 \n\t"\
  1518. "xorl %%ecx, %%ecx \n\t"
  1519. FUNNY_Y_CODE
  1520. FUNNY_Y_CODE
  1521. FUNNY_Y_CODE
  1522. FUNNY_Y_CODE
  1523. FUNNY_Y_CODE
  1524. FUNNY_Y_CODE
  1525. FUNNY_Y_CODE
  1526. FUNNY_Y_CODE
  1527. :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
  1528. "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF)
  1529. : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
  1530. );
  1531. for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
  1532. }
  1533. else
  1534. {
  1535. #endif
  1536. //NO MMX just normal asm ...
  1537. asm volatile(
  1538. "xorl %%eax, %%eax \n\t" // i
  1539. "xorl %%ebx, %%ebx \n\t" // xx
  1540. "xorl %%ecx, %%ecx \n\t" // 2*xalpha
  1541. ".balign 16 \n\t"
  1542. "1: \n\t"
  1543. "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
  1544. "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
  1545. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  1546. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  1547. "shll $16, %%edi \n\t"
  1548. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  1549. "movl %1, %%edi \n\t"
  1550. "shrl $9, %%esi \n\t"
  1551. "movw %%si, (%%edi, %%eax, 2) \n\t"
  1552. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  1553. "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
  1554. "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
  1555. "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
  1556. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  1557. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  1558. "shll $16, %%edi \n\t"
  1559. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  1560. "movl %1, %%edi \n\t"
  1561. "shrl $9, %%esi \n\t"
  1562. "movw %%si, 2(%%edi, %%eax, 2) \n\t"
  1563. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  1564. "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
  1565. "addl $2, %%eax \n\t"
  1566. "cmpl %2, %%eax \n\t"
  1567. " jb 1b \n\t"
  1568. :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
  1569. : "%eax", "%ebx", "%ecx", "%edi", "%esi"
  1570. );
  1571. #ifdef HAVE_MMX2
  1572. } //if MMX2 cant be used
  1573. #endif
  1574. #else
  1575. int i;
  1576. unsigned int xpos=0;
  1577. for(i=0;i<dstWidth;i++)
  1578. {
  1579. register unsigned int xx=xpos>>16;
  1580. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  1581. dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
  1582. xpos+=xInc;
  1583. }
  1584. #endif
  1585. }
  1586. }
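/* A minimal out-of-build sketch (illustrative, not part of the scaler) of what
   the plain-asm fast-bilinear path in hyscale() above computes: the source
   position is tracked in 16.16 fixed point with the fraction in %cx and the
   integer part in %ebx, and the addw/adcl pair corresponds to the carry
   handling below.  Output values are kept scaled by 128 (<<7). */
#if 0
static void hyscale_fast_sketch(uint16_t *dst, int dstWidth,
                                uint8_t *src, unsigned xInc)
{
	unsigned frac= 0, xx= 0; // %cx and %ebx in the asm
	int i;
	for(i=0; i<dstWidth; i++)
	{
		dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*(int)(frac>>9);
		frac+= xInc&0xFFFF;             // addw %4, %%cx
		xx  += (xInc>>16) + (frac>>16); // adcl %3, %%ebx (adds the carry)
		frac&= 0xFFFF;
	}
}
#endif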
  1587. inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth,
  1588. uint8_t *src1, uint8_t *src2, int srcW, int xInc)
  1589. {
  1590. #ifdef HAVE_MMX
  1591. // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
  1592. if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
  1593. #else
  1594. if(sws_flags != SWS_FAST_BILINEAR)
  1595. #endif
  1596. {
  1597. RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  1598. RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  1599. }
  1600. else // Fast Bilinear upscale / crap downscale
  1601. {
  1602. #ifdef ARCH_X86
  1603. #ifdef HAVE_MMX2
  1604. int i;
  1605. if(canMMX2BeUsed)
  1606. {
  1607. asm volatile(
  1608. "pxor %%mm7, %%mm7 \n\t"
  1609. "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
  1610. "movd %5, %%mm6 \n\t" // xInc&0xFFFF
  1611. "punpcklwd %%mm6, %%mm6 \n\t"
  1612. "punpcklwd %%mm6, %%mm6 \n\t"
  1613. "movq %%mm6, %%mm2 \n\t"
  1614. "psllq $16, %%mm2 \n\t"
  1615. "paddw %%mm6, %%mm2 \n\t"
  1616. "psllq $16, %%mm2 \n\t"
  1617. "paddw %%mm6, %%mm2 \n\t"
  1618. "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
  1619. "movq %%mm2, temp0 \n\t"
  1620. "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
  1621. "punpcklwd %%mm6, %%mm6 \n\t"
  1622. "punpcklwd %%mm6, %%mm6 \n\t"
  1623. "xorl %%eax, %%eax \n\t" // i
  1624. "movl %0, %%esi \n\t" // src
  1625. "movl %1, %%edi \n\t" // buf1
  1626. "movl %3, %%edx \n\t" // (xInc*4)>>16
  1627. "xorl %%ecx, %%ecx \n\t"
  1628. "xorl %%ebx, %%ebx \n\t"
  1629. "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
  1630. #define FUNNYUVCODE \
  1631. PREFETCH" 1024(%%esi) \n\t"\
  1632. PREFETCH" 1056(%%esi) \n\t"\
  1633. PREFETCH" 1088(%%esi) \n\t"\
  1634. "call funnyUVCode \n\t"\
  1635. "movq temp0, %%mm2 \n\t"\
  1636. "xorl %%ecx, %%ecx \n\t"
  1637. FUNNYUVCODE
  1638. FUNNYUVCODE
  1639. FUNNYUVCODE
  1640. FUNNYUVCODE
  1641. FUNNYUVCODE
  1642. FUNNYUVCODE
  1643. FUNNYUVCODE
  1644. FUNNYUVCODE
  1645. "xorl %%eax, %%eax \n\t" // i
  1646. "movl %6, %%esi \n\t" // src
  1647. "movl %1, %%edi \n\t" // buf1
  1648. "addl $4096, %%edi \n\t"
  1649. FUNNYUVCODE
  1650. FUNNYUVCODE
  1651. FUNNYUVCODE
  1652. FUNNYUVCODE
  1653. FUNNYUVCODE
  1654. FUNNYUVCODE
  1655. FUNNYUVCODE
  1656. FUNNYUVCODE
  1657. :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
  1658. "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2)
  1659. : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
  1660. );
  1661. for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
  1662. {
  1663. // printf("%d %d %d\n", dstWidth, i, srcW);
  1664. dst[i] = src1[srcW-1]*128;
  1665. dst[i+2048] = src2[srcW-1]*128;
  1666. }
  1667. }
  1668. else
  1669. {
  1670. #endif
  1671. asm volatile(
  1672. "xorl %%eax, %%eax \n\t" // i
  1673. "xorl %%ebx, %%ebx \n\t" // xx
  1674. "xorl %%ecx, %%ecx \n\t" // 2*xalpha
  1675. ".balign 16 \n\t"
  1676. "1: \n\t"
  1677. "movl %0, %%esi \n\t"
  1678. "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
  1679. "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
  1680. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  1681. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  1682. "shll $16, %%edi \n\t"
  1683. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  1684. "movl %1, %%edi \n\t"
  1685. "shrl $9, %%esi \n\t"
  1686. "movw %%si, (%%edi, %%eax, 2) \n\t"
  1687. "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
  1688. "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
  1689. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  1690. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  1691. "shll $16, %%edi \n\t"
  1692. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  1693. "movl %1, %%edi \n\t"
  1694. "shrl $9, %%esi \n\t"
  1695. "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
  1696. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  1697. "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
  1698. "addl $1, %%eax \n\t"
  1699. "cmpl %2, %%eax \n\t"
  1700. " jb 1b \n\t"
  1701. :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
  1702. "r" (src2)
  1703. : "%eax", "%ebx", "%ecx", "%edi", "%esi"
  1704. );
  1705. #ifdef HAVE_MMX2
  1706. } //if MMX2 cant be used
  1707. #endif
  1708. #else
  1709. int i;
  1710. unsigned int xpos=0;
  1711. for(i=0;i<dstWidth;i++)
  1712. {
  1713. register unsigned int xx=xpos>>16;
  1714. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  1715. dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
  1716. dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
  1717. /* slower
  1718. dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
  1719. dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
  1720. */
  1721. xpos+=xInc;
  1722. }
  1723. #endif
  1724. }
  1725. }
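/* Notes on hcscale() above (illustrative): the U and V results of one line
   share a single uint16_t buffer, with V stored 2048 entries (4096 bytes)
   after U, which is why the asm writes to 4096(%%edi, %%eax, 2) and the C
   path to dst[i+2048].  The C path's (xalpha^127) weight is a cheap
   substitute for (128-xalpha): for the 7-bit weights used here it equals
   127-xalpha, e.g. xalpha=5 gives 5^127=122 versus 128-5=123, an error of
   at most one. */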
  1726. static inline void RENAME(initFilter)(int16_t *dstFilter, int16_t *filterPos, int *filterSize, int xInc,
  1727. int srcW, int dstW, int filterAlign, int one)
  1728. {
  1729. int i;
  1730. double filter[8000];
  1731. #ifdef HAVE_MMX
  1732. asm volatile("emms\n\t"::: "memory"); //FIXME this shouldn't be required but it IS (even for non-MMX versions)
  1733. #endif
  1734. if(ABS(xInc - 0x10000) <10) // unscaled
  1735. {
  1736. int i;
  1737. *filterSize= (1 +(filterAlign-1)) & (~(filterAlign-1)); // 1 or 4 normally
  1738. for(i=0; i<dstW*(*filterSize); i++) filter[i]=0;
  1739. for(i=0; i<dstW; i++)
  1740. {
  1741. filter[i*(*filterSize)]=1;
  1742. filterPos[i]=i;
  1743. }
  1744. }
  1745. else if(xInc <= (1<<16) || sws_flags==SWS_FAST_BILINEAR) // upscale
  1746. {
  1747. int i;
  1748. int xDstInSrc;
  1749. if(sws_flags==SWS_BICUBIC) *filterSize= 4;
  1750. else *filterSize= 2;
  1751. // printf("%d %d %d\n", filterSize, srcW, dstW);
  1752. *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));
  1753. xDstInSrc= xInc - 0x8000;
  1754. for(i=0; i<dstW; i++)
  1755. {
  1756. int xx= (xDstInSrc>>16) - (*filterSize>>1) + 1;
  1757. int j;
  1758. filterPos[i]= xx;
  1759. if(sws_flags == SWS_BICUBIC)
  1760. {
  1761. double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
  1762. double y1,y2,y3,y4;
  1763. double A= -0.75;
  1764. // Equation is from VirtualDub
  1765. y1 = ( + A*d - 2.0*A*d*d + A*d*d*d);
  1766. y2 = (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
  1767. y3 = ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
  1768. y4 = ( + A*d*d - A*d*d*d);
  1769. // printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
  1770. filter[i*(*filterSize) + 0]= y1;
  1771. filter[i*(*filterSize) + 1]= y2;
  1772. filter[i*(*filterSize) + 2]= y3;
  1773. filter[i*(*filterSize) + 3]= y4;
  1774. // printf("%1.3f %d, %d, %d, %d\n",d , y1, y2, y3, y4);
  1775. }
  1776. else
  1777. {
  1778. for(j=0; j<*filterSize; j++)
  1779. {
  1780. double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
  1781. double coeff= 1.0 - d;
  1782. if(coeff<0) coeff=0;
  1783. // printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
  1784. filter[i*(*filterSize) + j]= coeff;
  1785. xx++;
  1786. }
  1787. }
  1788. xDstInSrc+= xInc;
  1789. }
  1790. }
  1791. else // downscale
  1792. {
  1793. int xDstInSrc;
  1794. if(sws_flags==SWS_BICUBIC) *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
  1795. else *filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
  1796. // printf("%d %d %d\n", *filterSize, srcW, dstW);
  1797. *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));
  1798. xDstInSrc= xInc - 0x8000;
  1799. for(i=0; i<dstW; i++)
  1800. {
  1801. int xx= (int)((double)xDstInSrc/(double)(1<<16) - *filterSize*0.5 + 0.5);
  1802. int j;
  1803. filterPos[i]= xx;
  1804. for(j=0; j<*filterSize; j++)
  1805. {
  1806. double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
  1807. double coeff;
  1808. if(sws_flags == SWS_BICUBIC)
  1809. {
  1810. double A= -0.75;
  1811. // d*=2;
  1812. // Equation is from VirtualDub
  1813. if(d<1.0)
  1814. coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
  1815. else if(d<2.0)
  1816. coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
  1817. else
  1818. coeff=0.0;
  1819. }
  1820. else
  1821. {
  1822. coeff= 1.0 - d;
  1823. if(coeff<0) coeff=0;
  1824. }
  1825. // if(filterAlign==1) printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
  1826. filter[i*(*filterSize) + j]= coeff;
  1827. xx++;
  1828. }
  1829. xDstInSrc+= xInc;
  1830. }
  1831. }
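/* Both bicubic branches above evaluate the same cubic kernel (the Keys
   family with A=-0.75, as used by VirtualDub):
       w(d) = (A+2)*|d|^3 - (A+3)*|d|^2 + 1        for |d| < 1
       w(d) = A*|d|^3 - 5*A*|d|^2 + 8*A*|d| - 4*A  for 1 <= |d| < 2
       w(d) = 0                                    otherwise
   The upscale branch simply has the four taps w(1+d), w(d), w(1-d) and
   w(2-d) written out explicitly. */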
  1832. //fix borders
  1833. for(i=0; i<dstW; i++)
  1834. {
  1835. int j;
  1836. if(filterPos[i] < 0)
  1837. {
  1838. // Move filter coeffs left to compensate for filterPos
  1839. for(j=1; j<*filterSize; j++)
  1840. {
  1841. int left= MAX(j + filterPos[i], 0);
  1842. filter[i*(*filterSize) + left] += filter[i*(*filterSize) + j];
  1843. filter[i*(*filterSize) + j]=0;
  1844. }
  1845. filterPos[i]= 0;
  1846. }
  1847. if(filterPos[i] + (*filterSize) > srcW)
  1848. {
  1849. int shift= filterPos[i] + (*filterSize) - srcW;
  1850. // Move filter coeffs right to compensate for filterPos
  1851. for(j=(*filterSize)-2; j>=0; j--)
  1852. {
  1853. int right= MIN(j + shift, (*filterSize)-1);
  1854. filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j];
  1855. filter[i*(*filterSize) +j]=0;
  1856. }
  1857. filterPos[i]= srcW - (*filterSize);
  1858. }
  1859. }
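/* Illustrative example of the clamping above: with filterSize=4,
   filterPos[i]=-1 and coefficients {c0,c1,c2,c3}, the first loop produces
   {c0+c1, c2, c3, 0} with filterPos[i]=0, i.e. the weight of the
   out-of-range source pixel is folded into its nearest valid neighbour;
   the right-edge case is handled symmetrically. */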
  1860. //FIXME try to align filterpos if possible / try to shift filterpos to put zeros at the end
  1861. // and skip them later
  1862. //Normalize
  1863. for(i=0; i<dstW; i++)
  1864. {
  1865. int j;
  1866. double sum=0;
  1867. double scale= one;
  1868. for(j=0; j<*filterSize; j++)
  1869. {
  1870. sum+= filter[i*(*filterSize) + j];
  1871. }
  1872. scale/= sum;
  1873. for(j=0; j<*filterSize; j++)
  1874. {
  1875. dstFilter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale);
  1876. }
  1877. }
  1878. }
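/* Out-of-build sketch (names and the final shift are illustrative assumptions;
   the real consumers are the hScale/yuv2* routines in this file) of how a
   filter produced by initFilter() is meant to be applied: a dot product over
   filterSize source samples, scaled back down by the normalization "one". */
#if 0
static int applyFilterTap(uint8_t *src, int16_t *filter, int16_t *filterPos,
                          int filterSize, int i, int shift)
{
	int j, val= 0;
	for(j=0; j<filterSize; j++)
		val+= src[filterPos[i] + j] * filter[i*filterSize + j];
	return val>>shift; // e.g. >>7 for coefficients normalized to 1<<14 when
	                   // the result should stay scaled by 128
}
#endif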
  1879. #ifdef HAVE_MMX2
  1880. static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
  1881. {
  1882. uint8_t *fragment;
  1883. int imm8OfPShufW1;
  1884. int imm8OfPShufW2;
  1885. int fragmentLength;
  1886. int xpos, i;
  1887. // create an optimized horizontal scaling routine
  1888. //code fragment
  1889. asm volatile(
  1890. "jmp 9f \n\t"
  1891. // Begin
  1892. "0: \n\t"
  1893. "movq (%%esi), %%mm0 \n\t" //FIXME Alignment
  1894. "movq %%mm0, %%mm1 \n\t"
  1895. "psrlq $8, %%mm0 \n\t"
  1896. "punpcklbw %%mm7, %%mm1 \n\t"
  1897. "movq %%mm2, %%mm3 \n\t"
  1898. "punpcklbw %%mm7, %%mm0 \n\t"
  1899. "addw %%bx, %%cx \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
  1900. "pshufw $0xFF, %%mm1, %%mm1 \n\t"
  1901. "1: \n\t"
  1902. "adcl %%edx, %%esi \n\t" //xx+= (4*lumXInc)>>16 + carry
  1903. "pshufw $0xFF, %%mm0, %%mm0 \n\t"
  1904. "2: \n\t"
  1905. "psrlw $9, %%mm3 \n\t"
  1906. "psubw %%mm1, %%mm0 \n\t"
  1907. "pmullw %%mm3, %%mm0 \n\t"
  1908. "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF
  1909. "psllw $7, %%mm1 \n\t"
  1910. "paddw %%mm1, %%mm0 \n\t"
  1911. "movq %%mm0, (%%edi, %%eax) \n\t"
  1912. "addl $8, %%eax \n\t"
  1913. // End
  1914. "9: \n\t"
  1915. // "int $3\n\t"
  1916. "leal 0b, %0 \n\t"
  1917. "leal 1b, %1 \n\t"
  1918. "leal 2b, %2 \n\t"
  1919. "decl %1 \n\t"
  1920. "decl %2 \n\t"
  1921. "subl %0, %1 \n\t"
  1922. "subl %0, %2 \n\t"
  1923. "leal 9b, %3 \n\t"
  1924. "subl %0, %3 \n\t"
  1925. :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
  1926. "=r" (fragmentLength)
  1927. );
  1928. xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
  1929. for(i=0; i<dstW/8; i++)
  1930. {
  1931. int xx=xpos>>16;
  1932. if((i&3) == 0)
  1933. {
  1934. int a=0;
  1935. int b=((xpos+xInc)>>16) - xx;
  1936. int c=((xpos+xInc*2)>>16) - xx;
  1937. int d=((xpos+xInc*3)>>16) - xx;
  1938. memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
  1939. funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
  1940. funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
  1941. a | (b<<2) | (c<<4) | (d<<6);
  1942. // if we don't need to read 8 bytes then don't :), reduces the chance of
  1943. // crossing a cache line
  1944. if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
  1945. funnyCode[fragmentLength*(i+4)/4]= RET;
  1946. }
  1947. xpos+=xInc;
  1948. }
  1949. /*
  1950. xpos= 0; //chrXInc/2 - 0x10000; // difference between centers of chrom samples
  1951. for(i=0; i<dstUVw/8; i++)
  1952. {
  1953. int xx=xpos>>16;
  1954. if((i&3) == 0)
  1955. {
  1956. int a=0;
  1957. int b=((xpos+chrXInc)>>16) - xx;
  1958. int c=((xpos+chrXInc*2)>>16) - xx;
  1959. int d=((xpos+chrXInc*3)>>16) - xx;
  1960. memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);
  1961. funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
  1962. funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
  1963. a | (b<<2) | (c<<4) | (d<<6);
  1964. // if we don't need to read 8 bytes then don't :), reduces the chance of
  1965. // crossing a cache line
  1966. if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E;
  1967. funnyUVCode[fragmentLength*(i+4)/4]= RET;
  1968. }
  1969. xpos+=chrXInc;
  1970. }
  1971. */
  1972. // funnyCode[0]= RET;
  1973. }
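/* Illustrative example of the code patching above: the pshufw immediate
   a|(b<<2)|(c<<4)|(d<<6) selects which of the loaded source pixels feeds each
   of the four outputs of one fragment.  With xInc=0xC000 (0.75 in 16.16) the
   relative integer steps are b=0, c=1, d=2, so the byte written is 0x90; and
   because d<3 only four source bytes are needed, so the movq load's opcode
   byte is patched to 0x6E (movd) to avoid reading past the needed data. */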
  1974. #endif // HAVE_MMX2
  1975. static void RENAME(SwScale_YV12slice)(unsigned char* srcptr[],int stride[], int srcSliceY ,
  1976. int srcSliceH, uint8_t* dstptr[], int dststride, int dstbpp,
  1977. int srcW, int srcH, int dstW, int dstH){
  1978. unsigned int lumXInc= (srcW << 16) / dstW;
  1979. unsigned int lumYInc= (srcH << 16) / dstH;
  1980. unsigned int chrXInc;
  1981. unsigned int chrYInc;
  1982. static int dstY;
  1983. // used to detect a size change
  1984. static int oldDstW= -1;
  1985. static int oldSrcW= -1;
  1986. static int oldDstH= -1;
  1987. static int oldSrcH= -1;
  1988. static int oldFlags=-1;
  1989. static int lastInLumBuf;
  1990. static int lastInChrBuf;
  1991. int chrDstW, chrDstH;
  1992. static int lumBufIndex=0;
  1993. static int chrBufIndex=0;
  1994. static int firstTime=1;
  1995. const int widthAlign= dstbpp==12 ? 16 : 8;
  1996. const int bytespp= (dstbpp+1)/8; //(12->1, 15&16->2, 24->3, 32->4)
  1997. const int over= dstbpp==12 ? (((dstW+15)&(~15))) - dststride
  1998. : (((dstW+7)&(~7)))*bytespp - dststride;
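/* Illustrative example of "over": with dstbpp=16 (bytespp=2), dstW=722 and
   dststride=1444, over = ((722+7)&~7)*2 - 1444 = 1456 - 1444 = 12, i.e. the
   optimized writers would spill 12 bytes past the stride, so the last output
   lines fall back to the C path below. */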
  1999. if(dststride%widthAlign !=0 )
  2000. {
  2001. if(firstTime)
  2002. fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n"
  2003. "SwScaler: ->cannot do aligned memory acesses anymore\n",
  2004. widthAlign);
  2005. }
  2006. if(over>0)
  2007. {
  2008. if(firstTime)
  2009. fprintf(stderr, "SwScaler: Warning: output width is not a multiple of 8 (16 for YV12)\n"
  2010. "SwScaler: and dststride is not large enough to handle %d extra bytes\n"
  2011. "SwScaler: ->using unoptimized C version for last line(s)\n",
  2012. over);
  2013. }
  2014. //printf("%d %d %d %d\n", srcW, srcH, dstW, dstH);
  2015. //printf("%d %d %d %d\n", lumXInc, lumYInc, srcSliceY, srcSliceH);
  2016. #ifdef HAVE_MMX2
  2017. canMMX2BeUsed= (lumXInc <= 0x10000 && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
  2018. if(!canMMX2BeUsed && lumXInc <= 0x10000 && (srcW&15)==0 && sws_flags==SWS_FAST_BILINEAR)
  2019. {
  2020. if(firstTime)
  2021. fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
  2022. }
  2023. #else
  2024. canMMX2BeUsed=0; // should be 0 anyway but ...
  2025. #endif
  2026. if(firstTime)
  2027. {
  2028. #if defined (DITHER1XBPP) && defined (HAVE_MMX)
  2029. char *dither= " dithered";
  2030. #else
  2031. char *dither= "";
  2032. #endif
  2033. if(sws_flags==SWS_FAST_BILINEAR)
  2034. fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler ");
  2035. else if(sws_flags==SWS_BILINEAR)
  2036. fprintf(stderr, "\nSwScaler: BILINEAR scaler ");
  2037. else if(sws_flags==SWS_BICUBIC)
  2038. fprintf(stderr, "\nSwScaler: BICUBIC scaler ");
  2039. else
  2040. fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");
  2041. if(dstbpp==15)
  2042. fprintf(stderr, "with%s BGR15 output ", dither);
  2043. else if(dstbpp==16)
  2044. fprintf(stderr, "with%s BGR16 output ", dither);
  2045. else if(dstbpp==24)
  2046. fprintf(stderr, "with BGR24 output ");
  2047. else if(dstbpp==32)
  2048. fprintf(stderr, "with BGR32 output ");
  2049. else if(dstbpp==12)
  2050. fprintf(stderr, "with YV12 output ");
  2051. else
  2052. fprintf(stderr, "without output ");
  2053. #ifdef HAVE_MMX2
  2054. fprintf(stderr, "using MMX2\n");
  2055. #elif defined (HAVE_3DNOW)
  2056. fprintf(stderr, "using 3DNOW\n");
  2057. #elif defined (HAVE_MMX)
  2058. fprintf(stderr, "using MMX\n");
  2059. #elif defined (ARCH_X86)
  2060. fprintf(stderr, "using X86 ASM\n");
  2061. #else
  2062. fprintf(stderr, "using C\n");
  2063. #endif
  2064. }
  2065. // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
  2066. // n-2 is the last chrominance sample available
  2067. // this is not perfect, but no one should notice the difference; the more correct variant
  2068. // would be like the vertical one, but that would require some special code for the
  2069. // first and last pixel
  2070. if(sws_flags==SWS_FAST_BILINEAR)
  2071. {
  2072. if(canMMX2BeUsed) lumXInc+= 20;
  2073. #ifndef HAVE_MMX //we dont use the x86asm scaler if mmx is available
  2074. else lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
  2075. #endif
  2076. }
  2077. if(fullUVIpol && !(dstbpp==12)) chrXInc= lumXInc>>1, chrDstW= dstW;
  2078. else chrXInc= lumXInc, chrDstW= (dstW+1)>>1;
  2079. if(dstbpp==12) chrYInc= lumYInc, chrDstH= (dstH+1)>>1;
  2080. else chrYInc= lumYInc>>1, chrDstH= dstH;
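/* Illustrative example of the increments above: scaling 640x480 YV12 to
   720x576 YV12 gives lumXInc=(640<<16)/720 and chrDstW=360, chrDstH=288,
   with the chroma planes reusing the luma increments since source and
   destination chroma are both subsampled 2x2.  For RGB output every
   destination line needs chroma, so chrDstH stays at dstH and chrYInc is
   halved instead. */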
  2081. // force calculation of the horizontal interpolation of the first line
  2082. if(srcSliceY ==0){
  2083. // printf("dstW %d, srcw %d, mmx2 %d\n", dstW, srcW, canMMX2BeUsed);
  2084. lumBufIndex=0;
  2085. chrBufIndex=0;
  2086. dstY=0;
  2087. //precalculate horizontal scaler filter coefficients
  2088. if(oldDstW!=dstW || oldSrcW!=srcW || oldFlags!=sws_flags)
  2089. {
  2090. #ifdef HAVE_MMX
  2091. const int filterAlign=4;
  2092. #else
  2093. const int filterAlign=1;
  2094. #endif
  2095. oldDstW= dstW; oldSrcW= srcW; oldFlags= sws_flags;
  2096. RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc,
  2097. srcW , dstW , filterAlign, 1<<14);
  2098. RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc,
  2099. (srcW+1)>>1, chrDstW, filterAlign, 1<<14);
  2100. #ifdef HAVE_MMX2
  2101. // the MMX2 fast-bilinear scaler can't downscale !!!
  2102. if(canMMX2BeUsed && sws_flags == SWS_FAST_BILINEAR)
  2103. {
  2104. initMMX2HScaler(dstW , lumXInc, funnyYCode);
  2105. initMMX2HScaler(chrDstW, chrXInc, funnyUVCode);
  2106. }
  2107. #endif
  2108. } // Init Horizontal stuff
  2109. if(oldDstH!=dstH || oldSrcH!=srcH || oldFlags!=sws_flags)
  2110. {
  2111. int i;
  2112. oldDstH= dstH; oldSrcH= srcH; oldFlags= sws_flags; //FIXME swsflags conflict with x check
  2113. // deallocate pixbufs
  2114. for(i=0; i<vLumBufSize; i++) free(lumPixBuf[i]);
  2115. for(i=0; i<vChrBufSize; i++) free(chrPixBuf[i]);
  2116. RENAME(initFilter)(vLumFilter, vLumFilterPos, &vLumFilterSize, lumYInc,
  2117. srcH , dstH, 1, (1<<12)-4);
  2118. RENAME(initFilter)(vChrFilter, vChrFilterPos, &vChrFilterSize, chrYInc,
  2119. (srcH+1)>>1, chrDstH, 1, (1<<12)-4);
  2120. // Calculate buffer sizes so that they won't run out while handling these damn slices
  2121. vLumBufSize= vLumFilterSize; vChrBufSize= vChrFilterSize;
  2122. for(i=0; i<dstH; i++)
  2123. {
  2124. int chrI= i*chrDstH / dstH;
  2125. int nextSlice= MAX(vLumFilterPos[i ] + vLumFilterSize - 1,
  2126. ((vChrFilterPos[chrI] + vChrFilterSize - 1)<<1));
  2127. nextSlice&= ~1; // Slices start at even boundaries
  2128. if(vLumFilterPos[i ] + vLumBufSize < nextSlice)
  2129. vLumBufSize= nextSlice - vLumFilterPos[i ];
  2130. if(vChrFilterPos[chrI] + vChrBufSize < (nextSlice>>1))
  2131. vChrBufSize= (nextSlice>>1) - vChrFilterPos[chrI];
  2132. }
  2133. // allocate pixbufs (we use dynamic allocation because otherwise we would need to
  2134. // allocate several megabytes to handle all possible cases)
  2135. for(i=0; i<vLumBufSize; i++)
  2136. lumPixBuf[i]= lumPixBuf[i+vLumBufSize]= (uint16_t*)memalign(8, 4000);
  2137. for(i=0; i<vChrBufSize; i++)
  2138. chrPixBuf[i]= chrPixBuf[i+vChrBufSize]= (uint16_t*)memalign(8, 8000);
  2139. //try to avoid drawing green stuff between the right end and the stride end
  2140. for(i=0; i<vLumBufSize; i++) memset(lumPixBuf[i], 0, 4000);
  2141. for(i=0; i<vChrBufSize; i++) memset(chrPixBuf[i], 64, 8000);
  2142. ASSERT(chrDstH<=dstH)
  2143. ASSERT(vLumFilterSize*dstH*4<16000)
  2144. ASSERT(vChrFilterSize*chrDstH*4<16000)
  2145. #ifdef HAVE_MMX
  2146. // pack filter data for mmx code
  2147. for(i=0; i<vLumFilterSize*dstH; i++)
  2148. lumMmxFilter[4*i]=lumMmxFilter[4*i+1]=lumMmxFilter[4*i+2]=lumMmxFilter[4*i+3]=
  2149. vLumFilter[i];
  2150. for(i=0; i<vChrFilterSize*chrDstH; i++)
  2151. chrMmxFilter[4*i]=chrMmxFilter[4*i+1]=chrMmxFilter[4*i+2]=chrMmxFilter[4*i+3]=
  2152. vChrFilter[i];
  2153. #endif
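/* The packing above replicates each signed 16-bit coefficient into all four
   words of one 64-bit entry (e.g. 0x1234 becomes the qword
   0x1234123412341234), so the MMX vertical scalers can apply one coefficient
   to four pixels with a single pmulhw. */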
  2154. }
  2155. if(firstTime && verbose)
  2156. {
  2157. #ifdef HAVE_MMX2
  2158. int mmx2=1;
  2159. #else
  2160. int mmx2=0;
  2161. #endif
  2162. #ifdef HAVE_MMX
  2163. int mmx=1;
  2164. #else
  2165. int mmx=0;
  2166. #endif
  2167. #ifdef HAVE_MMX
  2168. if(canMMX2BeUsed && sws_flags==SWS_FAST_BILINEAR)
  2169. printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
  2170. else
  2171. {
  2172. if(hLumFilterSize==4)
  2173. printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
  2174. else if(hLumFilterSize==8)
  2175. printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
  2176. else
  2177. printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
  2178. if(hChrFilterSize==4)
  2179. printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
  2180. else if(hChrFilterSize==8)
  2181. printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
  2182. else
  2183. printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
  2184. }
  2185. #elif defined (ARCH_X86)
  2186. printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
  2187. #else
  2188. if(sws_flags==SWS_FAST_BILINEAR)
  2189. printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
  2190. else
  2191. printf("SwScaler: using C scaler for horizontal scaling\n");
  2192. #endif
  2193. if(dstbpp==12)
  2194. {
  2195. if(vLumFilterSize==1)
  2196. printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
  2197. else
  2198. printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
  2199. }
  2200. else
  2201. {
  2202. if(vLumFilterSize==1 && vChrFilterSize==2)
  2203. printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
  2204. "SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n", mmx ? "MMX" : "C");
  2205. else if(vLumFilterSize==2 && vChrFilterSize==2)
  2206. printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
  2207. else
  2208. printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
  2209. }
  2210. if(dstbpp==24)
  2211. printf("SwScaler: using %s YV12->BGR24 Converter\n",
  2212. mmx2 ? "MMX2" : (mmx ? "MMX" : "C"));
  2213. else
  2214. printf("SwScaler: using %s YV12->BGR%d Converter\n", mmx ? "MMX" : "C", dstbpp);
  2215. printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
  2216. }
  2217. lastInLumBuf= -1;
  2218. lastInChrBuf= -1;
  2219. } // if(firstLine)
  2220. for(;dstY < dstH; dstY++){
  2221. unsigned char *dest =dstptr[0]+dststride*dstY;
  2222. unsigned char *uDest=dstptr[1]+(dststride>>1)*(dstY>>1);
  2223. unsigned char *vDest=dstptr[2]+(dststride>>1)*(dstY>>1);
  2224. const int chrDstY= dstbpp==12 ? (dstY>>1) : dstY;
  2225. const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
  2226. const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
  2227. const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
  2228. const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
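/* Illustrative example of the bookkeeping above: with vLumFilterSize=4 and
   vLumFilterPos[dstY]=37, output line dstY needs source lines 37..40; the
   loops below horizontally scale whichever of those lines are not yet in the
   ring buffer (lastInLumBuf tracks the newest buffered source line). */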
  2229. if(sws_flags == SWS_FAST_BILINEAR)
  2230. {
  2231. //handle holes
  2232. if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
  2233. if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
  2234. }
  2235. ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
  2236. ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
  2237. // Do we have enough lines in this slice to output the dstY line
  2238. if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH)>>1))
  2239. {
  2240. //Do horizontal scaling
  2241. while(lastInLumBuf < lastLumSrcY)
  2242. {
  2243. uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0];
  2244. lumBufIndex++;
  2245. ASSERT(lumBufIndex < 2*vLumBufSize)
  2246. ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
  2247. ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
  2248. // printf("%d %d\n", lumBufIndex, vLumBufSize);
  2249. RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc);
  2250. lastInLumBuf++;
  2251. }
  2252. while(lastInChrBuf < lastChrSrcY)
  2253. {
  2254. uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1];
  2255. uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2];
  2256. chrBufIndex++;
  2257. ASSERT(chrBufIndex < 2*vChrBufSize)
  2258. ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
  2259. ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
  2260. RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
  2261. lastInChrBuf++;
  2262. }
  2263. //wrap buf index around to stay inside the ring buffer
  2264. if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
  2265. if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
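/* Ring-buffer example: with vLumBufSize=4 the index cycles 0,1,2,3,0,...;
   since lumPixBuf[i] and lumPixBuf[i+vLumBufSize] point to the same line, a
   filter window that wraps around can still be addressed as a contiguous run
   of pointers (see lumSrcPtr below). */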
  2266. }
  2267. else // not enough lines left in this slice -> load the rest in the buffer
  2268. {
  2269. /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
  2270. firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
  2271. lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
  2272. vChrBufSize, vLumBufSize);
  2273. */
  2274. //Do horizontal scaling
  2275. while(lastInLumBuf+1 < srcSliceY + srcSliceH)
  2276. {
  2277. uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0];
  2278. lumBufIndex++;
  2279. ASSERT(lumBufIndex < 2*vLumBufSize)
  2280. ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
  2281. ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
  2282. RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc);
  2283. lastInLumBuf++;
  2284. }
  2285. while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
  2286. {
  2287. uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1];
  2288. uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2];
  2289. chrBufIndex++;
  2290. ASSERT(chrBufIndex < 2*vChrBufSize)
  2291. ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
  2292. ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
  2293. RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
  2294. lastInChrBuf++;
  2295. }
  2296. //wrap buf index around to stay inside the ring buffer
  2297. if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
  2298. if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
  2299. break; //we can't output a dstY line, so let's try with the next slice
  2300. }
  2301. #ifdef HAVE_MMX
  2302. b5Dither= dither8[dstY&1];
  2303. g6Dither= dither4[dstY&1];
  2304. g5Dither= dither8[dstY&1];
  2305. r5Dither= dither8[(dstY+1)&1];
  2306. #endif
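/* The dither offsets above alternate with the output line (dstY&1) so the
   truncation error of the 5/6-bit BGR15/16 channels is spread over
   neighbouring lines; red uses the opposite phase from green/blue. */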
  2307. if(dstY < dstH-2 || over<=0)
  2308. {
  2309. if(dstbpp==12) //YV12
  2310. {
  2311. if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
  2312. if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
  2313. {
  2314. int16_t *lumBuf = lumPixBuf[0];
  2315. int16_t *chrBuf= chrPixBuf[0];
  2316. RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW);
  2317. }
  2318. else //General YV12
  2319. {
  2320. int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  2321. int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
  2322. RENAME(yuv2yuvX)(
  2323. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2324. vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2325. dest, uDest, vDest, dstW,
  2326. lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4);
  2327. }
  2328. }
  2329. else
  2330. {
  2331. int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  2332. int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
  2333. ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
  2334. ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
  2335. if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
  2336. {
  2337. int chrAlpha= vChrFilter[2*dstY+1];
  2338. RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
  2339. dest, dstW, chrAlpha, dstbpp);
  2340. }
  2341. else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
  2342. {
  2343. int lumAlpha= vLumFilter[2*dstY+1];
  2344. int chrAlpha= vChrFilter[2*dstY+1];
  2345. RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
  2346. dest, dstW, lumAlpha, chrAlpha, dstbpp);
  2347. }
  2348. else //General RGB
  2349. {
  2350. RENAME(yuv2rgbX)(
  2351. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2352. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2353. dest, dstW, dstbpp,
  2354. lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
  2355. }
  2356. }
  2357. }
  2358. else // hmm, looks like we can't use MMX here without overwriting this array's tail
  2359. {
  2360. int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  2361. int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
  2362. if(dstbpp==12) //YV12
  2363. {
  2364. if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
  2365. yuv2yuvXinC(
  2366. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2367. vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2368. dest, uDest, vDest, dstW);
  2369. }
  2370. else
  2371. {
  2372. ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
  2373. ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
  2374. yuv2rgbXinC(
  2375. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2376. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2377. dest, dstW, dstbpp);
  2378. }
  2379. }
  2380. }
  2381. #ifdef HAVE_MMX
  2382. __asm __volatile(SFENCE:::"memory");
  2383. __asm __volatile(EMMS:::"memory");
  2384. #endif
  2385. firstTime=0;
  2386. }