/*
Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif

#define YSCALEYUV2YV12X(x) \
"xorl %%eax, %%eax \n\t"\
"pxor %%mm3, %%mm3 \n\t"\
"pxor %%mm4, %%mm4 \n\t"\
"movl %0, %%edx \n\t"\
".balign 16 \n\t" /* FIXME Unroll? */\
"1: \n\t"\
"movl (%1, %%edx, 4), %%esi \n\t"\
"movq (%2, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
"movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\
"movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"addl $1, %%edx \n\t"\
" jnz 1b \n\t"\
"psraw $3, %%mm3 \n\t"\
"psraw $3, %%mm4 \n\t"\
"packuswb %%mm4, %%mm3 \n\t"\
MOVNTQ(%%mm3, (%3, %%eax))\
"addl $8, %%eax \n\t"\
"cmpl %4, %%eax \n\t"\
"pxor %%mm3, %%mm3 \n\t"\
"pxor %%mm4, %%mm4 \n\t"\
"movl %0, %%edx \n\t"\
"jb 1b \n\t"
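
/*
 * YSCALEYUV2YV12X implements the vertical filter: the 16-bit samples (8-bit
 * data scaled by 2^7) are multiplied by 16-bit coefficients with pmulhw,
 * which keeps bits 16..31 of each product, and the accumulated sums are
 * shifted right by 3 more bits before the saturating pack, so effectively
 *   dst[x] = clip( (sum over j of filter[j]*src[j][x]) >> 19, 0, 255 )
 * the same >>19 scaling the C fallback paths below use.
 */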

#define YSCALEYUV2YV121 \
"movl %2, %%eax \n\t"\
".balign 16 \n\t" /* FIXME Unroll? */\
"1: \n\t"\
"movq (%0, %%eax, 2), %%mm0 \n\t"\
"movq 8(%0, %%eax, 2), %%mm1 \n\t"\
"psraw $7, %%mm0 \n\t"\
"psraw $7, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
MOVNTQ(%%mm0, (%1, %%eax))\
"addl $8, %%eax \n\t"\
"jnc 1b \n\t"
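
/*
 * YSCALEYUV2YV121 is the unity-filter special case: each 16-bit sample is
 * just the 8-bit value scaled by 2^7, so psraw $7 restores the range and
 * packuswb saturates to 0..255.
 */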

/*
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
"r" (dest), "m" (dstW),
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/

#define YSCALEYUV2RGBX \
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movl %1, %%edx \n\t" /* -chrFilterSize */\
"movl %3, %%ebx \n\t" /* chrMmxFilter+chrFilterSize */\
"movl %7, %%ecx \n\t" /* chrSrc+chrFilterSize */\
"pxor %%mm3, %%mm3 \n\t"\
"pxor %%mm4, %%mm4 \n\t"\
"2: \n\t"\
"movl (%%ecx, %%edx, 4), %%esi \n\t"\
"movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
"movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\
"movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"addl $1, %%edx \n\t"\
" jnz 2b \n\t"\
\
"movl %0, %%edx \n\t" /* -lumFilterSize */\
"movl %2, %%ebx \n\t" /* lumMmxFilter+lumFilterSize */\
"movl %6, %%ecx \n\t" /* lumSrc+lumFilterSize */\
"pxor %%mm1, %%mm1 \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"2: \n\t"\
"movl (%%ecx, %%edx, 4), %%esi \n\t"\
"movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
"movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\
"movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm1 \n\t"\
"paddw %%mm5, %%mm7 \n\t"\
"addl $1, %%edx \n\t"\
" jnz 2b \n\t"\
\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
"psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
"pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
"pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
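
/*
 * YSCALEYUV2RGBX runs two inner loops per 8-pixel group, first accumulating
 * the vertical chroma filter (U in mm3, V in mm4) and then the luma filter
 * (Y1 in mm1, Y2 in mm7); the tail applies the fixed-point YUV->RGB matrix
 * and leaves the packed results as mm2=B, mm4=G, mm5=R with mm7 cleared,
 * which is exactly the layout the WRITEBGR* macros below expect.
 */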

#define FULL_YSCALEYUV2RGB \
"pxor %%mm7, %%mm7 \n\t"\
"movd %6, %%mm6 \n\t" /*yalpha1*/\
"punpcklwd %%mm6, %%mm6 \n\t"\
"punpcklwd %%mm6, %%mm6 \n\t"\
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
"movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
\
\
"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
"psraw $4, %%mm0 \n\t" /* uvbuf1[eax+2048] >>4*/\
"pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
\
\
"movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
"paddw %%mm1, %%mm3 \n\t" /* B*/\
"paddw %%mm1, %%mm0 \n\t" /* R*/\
"packuswb %%mm3, %%mm3 \n\t"\
\
"packuswb %%mm0, %%mm0 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm2, %%mm1 \n\t" /* G*/\
\
"packuswb %%mm1, %%mm1 \n\t"

#define YSCALEYUV2RGB \
"movd %6, %%mm6 \n\t" /*yalpha1*/\
"punpcklwd %%mm6, %%mm6 \n\t"\
"punpcklwd %%mm6, %%mm6 \n\t"\
"movq %%mm6, "MANGLE(asm_yalpha1)"\n\t"\
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"movq %%mm5, "MANGLE(asm_uvalpha1)"\n\t"\
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "MANGLE(asm_uvalpha1)", %%mm0\n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
"psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
"movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw "MANGLE(asm_yalpha1)", %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw "MANGLE(asm_yalpha1)", %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
"pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
"pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
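
/*
 * YSCALEYUV2RGB blends two source lines before converting: roughly
 *   Y = (buf0[i]*yalpha1 + buf1[i]*yalpha) / 4096
 * computed as buf1 + ((buf0-buf1)*yalpha1 >> 16) in 16-bit fixed point
 * (likewise for U/V with uvalpha1), matching the C fallback's
 * (buf0[i]*yalpha1 + buf1[i]*yalpha)>>19 up to the fixed-point scale.
 */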

#define YSCALEYUV2RGB1 \
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
"psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
"pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
"pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
"pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"

// do vertical chrominance interpolation
#define YSCALEYUV2RGB1b \
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
"psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
"psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
"pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
"pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
"pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
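
/*
 * YSCALEYUV2RGB1 converts a single source line with no vertical
 * interpolation. YSCALEYUV2RGB1b is used when both chroma buffers carry
 * roughly equal weight: it averages uvbuf0 and uvbuf1 (the psrlw $5 halves
 * their sum on top of the usual >>4), while luma still comes from buf0 alone.
 */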

#define WRITEBGR32 \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
MOVNTQ(%%mm0, (%4, %%eax, 4))\
MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"

#define WRITEBGR16 \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
"pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
"psrlq $3, %%mm2 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $3, %%mm3 \n\t"\
"psllq $3, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (%4, %%eax, 2))\
MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"

#define WRITEBGR15 \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
"pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
"psrlq $3, %%mm2 \n\t"\
"psrlq $1, %%mm5 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $2, %%mm3 \n\t"\
"psllq $2, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (%4, %%eax, 2))\
MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"
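
/*
 * WRITEBGR16/WRITEBGR15 pack each pixel into 16 bits: the bF8/bFC masks keep
 * the top 5 (or 6) bits of each component, and the shift/por sequence
 * assembles rrrrrggggggbbbbb (5-6-5) or 0rrrrrgggggbbbbb (5-5-5) per word.
 */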

#define WRITEBGR24OLD \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
"psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
"pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
"por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
"movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
"psllq $48, %%mm2 \n\t" /* GB000000 1 */\
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
"movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
"psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
"por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
"pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
"movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
"psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
"pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
"por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
"movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
"psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
"por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
"psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
"movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
"psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
"pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
"pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
"por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
"psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
"por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
MOVNTQ(%%mm0, (%%ebx))\
MOVNTQ(%%mm2, 8(%%ebx))\
MOVNTQ(%%mm3, 16(%%ebx))\
"addl $24, %%ebx \n\t"\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"

#define WRITEBGR24MMX \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
"movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
"movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
"movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
"psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
"psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
"psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
"psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
"punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
"punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
"punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
"punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
"psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
"movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
"psllq $40, %%mm2 \n\t" /* GB000000 1 */\
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
MOVNTQ(%%mm0, (%%ebx))\
\
"psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
"movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
"psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
"por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
MOVNTQ(%%mm6, 8(%%ebx))\
\
"psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
"psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
"por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
MOVNTQ(%%mm5, 16(%%ebx))\
\
"addl $24, %%ebx \n\t"\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"

#define WRITEBGR24MMX2 \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq "MANGLE(M24A)", %%mm0 \n\t"\
"movq "MANGLE(M24C)", %%mm7 \n\t"\
"pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
"pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
"pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
"pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
"pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
"pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
"psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
"por %%mm1, %%mm6 \n\t"\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, (%%ebx))\
\
"psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
"pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
"pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
"pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
"pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
"pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
"pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
"por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, 8(%%ebx))\
\
"pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
"pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
"pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
"pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
"pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
"pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
"por %%mm1, %%mm3 \n\t"\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, 16(%%ebx))\
\
"addl $24, %%ebx \n\t"\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"

#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX2
#else
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX
#endif
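
/*
 * All three 24-bit writers pack four 0RGB quadwords (8 pixels) into three
 * quadwords of BGR24 bytes: WRITEBGR24OLD with shift/mask/por chains,
 * WRITEBGR24MMX with punpck rearrangement, and WRITEBGR24MMX2 with pshufw
 * and the M24A/M24B/M24C masks; the #ifdef above picks the MMX2 variant
 * whenever pshufw is available.
 */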

static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW,
                                    int16_t * lumMmxFilter, int16_t * chrMmxFilter)
{
#ifdef HAVE_MMX
    if(uDest != NULL)
    {
        asm volatile(
            YSCALEYUV2YV12X(0)
            :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
               "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1)
            : "%eax", "%edx", "%esi"
        );

        asm volatile(
            YSCALEYUV2YV12X(4096)
            :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
               "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1)
            : "%eax", "%edx", "%esi"
        );
    }

    asm volatile(
        YSCALEYUV2YV12X(0)
        :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
           "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
        : "%eax", "%edx", "%esi"
    );
#else
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                dest, uDest, vDest, dstW);
#endif
}
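
/*
 * Note how yuv2yuvX drives YSCALEYUV2YV12X: the source and filter pointers
 * are passed pre-advanced by filterSize (chrSrc+chrFilterSize etc.) while
 * %0 holds -filterSize, so %%edx counts from -filterSize up to 0 and the
 * inner loop terminates on "jnz" without a separate compare.
 */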

static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
#ifdef HAVE_MMX
    if(uDest != NULL)
    {
        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)),
               "g" (-(dstW>>1))
            : "%eax"
        );

        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)),
               "g" (-(dstW>>1))
            : "%eax"
        );
    }

    asm volatile(
        YSCALEYUV2YV121
        :: "r" (lumSrc + dstW), "r" (dest + dstW),
           "g" (-dstW)
        : "%eax"
    );
#else
    //FIXME Optimize (just quickly written, not optimized)
    //FIXME replace MINMAX with LUTs
    int i;
    for(i=0; i<dstW; i++)
    {
        int val= lumSrc[i]>>7; // undo the 2^7 fixed-point scale, like psraw $7 above
        dest[i]= MIN(MAX(val, 0), 255);
    }
    if(uDest != NULL)
        for(i=0; i<(dstW>>1); i++)
        {
            int u=chrSrc[i]>>7;
            int v=chrSrc[i + 2048]>>7;
            uDest[i]= MIN(MAX(u, 0), 255);
            vDest[i]= MIN(MAX(v, 0), 255);
        }
#endif
}
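
/*
 * On the "replace MINMAX with LUTs" FIXME above, a minimal sketch of the
 * usual trick (hypothetical names, not part of this file): pad the clip
 * table on both sides so every reachable index is in range, then a clamped
 * store becomes a single lookup.
 *
 *   static uint8_t clip_lut[768]; // 256 zeros, identity 0..255, 256 times 255
 *   static void init_clip_lut(void){
 *       int i;
 *       for(i=0; i<768; i++)
 *           clip_lut[i]= i<256 ? 0 : (i<512 ? i-256 : 255);
 *   }
 *   // dest[i]= MIN(MAX(val, 0), 255)  ->  dest[i]= clip_lut[val+256];
 *   // valid for val in [-256, 511]
 */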

/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, int dstW, int dstFormat, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
{
/*  if(flags&SWS_FULL_UV_IPOL)
    {
        //FIXME
    }//FULL_UV_IPOL
    else*/
    {
#ifdef HAVE_MMX
        if(dstFormat == IMGFMT_BGR32) //FIXME untested
        {
            asm volatile(
                YSCALEYUV2RGBX
                WRITEBGR32
                :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                   "r" (dest), "m" (dstW),
                   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi"
            );
        }
        else if(dstFormat == IMGFMT_BGR24) //FIXME untested
        {
            asm volatile(
                YSCALEYUV2RGBX
                "leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize
                "addl %4, %%ebx \n\t"
                WRITEBGR24
                :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                   "r" (dest), "m" (dstW),
                   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi"
            );
        }
        else if(dstFormat==IMGFMT_BGR15)
        {
            asm volatile(
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                WRITEBGR15
                :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                   "r" (dest), "m" (dstW),
                   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi"
            );
        }
        else if(dstFormat==IMGFMT_BGR16)
        {
            asm volatile(
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                WRITEBGR16
                :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                   "r" (dest), "m" (dstW),
                   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi"
            );
        }
#else
        yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
                    chrFilter, chrSrc, chrFilterSize,
                    dest, dstW, dstFormat);
#endif
    } //!FULL_UV_IPOL
}

/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                    uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstFormat, int flags)
{
    int yalpha1=yalpha^4095;
    int uvalpha1=uvalpha^4095;

    if(flags&SWS_FULL_UV_IPOL)
    {
#ifdef HAVE_MMX
        if(dstFormat==IMGFMT_BGR32)
        {
            asm volatile(
                FULL_YSCALEYUV2RGB
                "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
                "movq %%mm3, %%mm1 \n\t"
                "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
                MOVNTQ(%%mm3, (%4, %%eax, 4))
                MOVNTQ(%%mm1, 8(%4, %%eax, 4))
                "addl $4, %%eax \n\t"
                "cmpl %5, %%eax \n\t"
                " jb 1b \n\t"
                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstFormat==IMGFMT_BGR24)
        {
            asm volatile(
                FULL_YSCALEYUV2RGB
                // lsb ... msb
                "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
                "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
                "movq %%mm3, %%mm1 \n\t"
                "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
                "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
                "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
                "psrlq $8, %%mm3 \n\t" // GR0BGR00
                "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
                "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
                "por %%mm2, %%mm3 \n\t" // BGRBGR00
                "movq %%mm1, %%mm2 \n\t"
                "psllq $48, %%mm1 \n\t" // 000000BG
                "por %%mm1, %%mm3 \n\t" // BGRBGRBG
                "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
                "psrld $16, %%mm2 \n\t" // R000R000
                "psrlq $24, %%mm1 \n\t" // 0BGR0000
                "por %%mm2, %%mm1 \n\t" // RBGRR000
                "movl %4, %%ebx \n\t"
                "addl %%eax, %%ebx \n\t"
#ifdef HAVE_MMX2
                //FIXME Alignment
                "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
                "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
                "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
                "psrlq $32, %%mm3 \n\t"
                "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
                "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
#endif
                "addl $4, %%eax \n\t"
                "cmpl %5, %%eax \n\t"
                " jb 1b \n\t"
                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax", "%ebx"
            );
        }
        else if(dstFormat==IMGFMT_BGR15)
        {
            asm volatile(
                FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
                "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
                "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
                "psrlw $3, %%mm3 \n\t"
                "psllw $2, %%mm1 \n\t"
                "psllw $7, %%mm0 \n\t"
                "pand "MANGLE(g15Mask)", %%mm1 \n\t"
                "pand "MANGLE(r15Mask)", %%mm0 \n\t"
                "por %%mm3, %%mm1 \n\t"
                "por %%mm1, %%mm0 \n\t"
                MOVNTQ(%%mm0, (%4, %%eax, 2))
                "addl $4, %%eax \n\t"
                "cmpl %5, %%eax \n\t"
                " jb 1b \n\t"
                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstFormat==IMGFMT_BGR16)
        {
            asm volatile(
                FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
                "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
                "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
                "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
                "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
                "psrlw $3, %%mm3 \n\t"
                "psllw $3, %%mm1 \n\t"
                "psllw $8, %%mm0 \n\t"
                "pand "MANGLE(g16Mask)", %%mm1 \n\t"
                "pand "MANGLE(r16Mask)", %%mm0 \n\t"
                "por %%mm3, %%mm1 \n\t"
                "por %%mm1, %%mm0 \n\t"
                MOVNTQ(%%mm0, (%4, %%eax, 2))
                "addl $4, %%eax \n\t"
                "cmpl %5, %%eax \n\t"
                " jb 1b \n\t"
                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
#else
        if(dstFormat==IMGFMT_BGR32)
        {
            int i;
            for(i=0;i<dstW;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                dest+= 4;
            }
        }
        else if(dstFormat==IMGFMT_BGR24)
        {
            int i;
            for(i=0;i<dstW;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                dest+= 3;
            }
        }
        else if(dstFormat==IMGFMT_BGR16)
        {
            int i;
            for(i=0;i<dstW;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                ((uint16_t*)dest)[i] =
                    clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
                    clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                    clip_table16r[(Y + yuvtab_3343[V]) >>13];
            }
        }
        else if(dstFormat==IMGFMT_BGR15)
        {
            int i;
            for(i=0;i<dstW;i++){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                ((uint16_t*)dest)[i] =
                    clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
                    clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                    clip_table15r[(Y + yuvtab_3343[V]) >>13];
            }
        }
#endif
    }//FULL_UV_IPOL
    else
    {
#ifdef HAVE_MMX
        if(dstFormat==IMGFMT_BGR32)
        {
            asm volatile(
                YSCALEYUV2RGB
                WRITEBGR32
                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstFormat==IMGFMT_BGR24)
        {
            asm volatile(
                "movl %4, %%ebx \n\t"
                YSCALEYUV2RGB
                WRITEBGR24
                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax", "%ebx"
            );
        }
        else if(dstFormat==IMGFMT_BGR15)
        {
            asm volatile(
                YSCALEYUV2RGB
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                WRITEBGR15
                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstFormat==IMGFMT_BGR16)
        {
            asm volatile(
                YSCALEYUV2RGB
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                WRITEBGR16
                :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
#else
        if(dstFormat==IMGFMT_BGR32)
        {
            int i;
            for(i=0; i<dstW-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];
                dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
                dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
                dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
                dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
                dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
                dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
            }
        }
        else if(dstFormat==IMGFMT_BGR24)
        {
            int i;
            for(i=0; i<dstW-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];
                dest[0]=clip_table[((Y1 + Cb) >>13)];
                dest[1]=clip_table[((Y1 + Cg) >>13)];
                dest[2]=clip_table[((Y1 + Cr) >>13)];
                dest[3]=clip_table[((Y2 + Cb) >>13)];
                dest[4]=clip_table[((Y2 + Cg) >>13)];
                dest[5]=clip_table[((Y2 + Cr) >>13)];
                dest+=6;
            }
        }
        else if(dstFormat==IMGFMT_BGR16)
        {
            int i;
#ifdef DITHER1XBPP
            // 2x2 ordered dither: the static offsets are XOR-toggled on every
            // call, so the pattern alternates from one output line to the next
            static int ditherb1=1<<14;
            static int ditherg1=1<<13;
            static int ditherr1=2<<14;
            static int ditherb2=3<<14;
            static int ditherg2=3<<13;
            static int ditherr2=0<<14;

            ditherb1 ^= (1^2)<<14;
            ditherg1 ^= (1^2)<<13;
            ditherr1 ^= (1^2)<<14;
            ditherb2 ^= (3^0)<<14;
            ditherg2 ^= (3^0)<<13;
            ditherr2 ^= (3^0)<<14;
#else
            const int ditherb1=0;
            const int ditherg1=0;
            const int ditherr1=0;
            const int ditherb2=0;
            const int ditherg2=0;
            const int ditherr2=0;
#endif
            for(i=0; i<dstW-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];
                ((uint16_t*)dest)[i] =
                    clip_table16b[(Y1 + Cb + ditherb1) >>13] |
                    clip_table16g[(Y1 + Cg + ditherg1) >>13] |
                    clip_table16r[(Y1 + Cr + ditherr1) >>13];
                ((uint16_t*)dest)[i+1] =
                    clip_table16b[(Y2 + Cb + ditherb2) >>13] |
                    clip_table16g[(Y2 + Cg + ditherg2) >>13] |
                    clip_table16r[(Y2 + Cr + ditherr2) >>13];
            }
        }
        else if(dstFormat==IMGFMT_BGR15)
        {
            int i;
#ifdef DITHER1XBPP
            static int ditherb1=1<<14;
            static int ditherg1=1<<14;
            static int ditherr1=2<<14;
            static int ditherb2=3<<14;
            static int ditherg2=3<<14;
            static int ditherr2=0<<14;

            ditherb1 ^= (1^2)<<14;
            ditherg1 ^= (1^2)<<14;
            ditherr1 ^= (1^2)<<14;
            ditherb2 ^= (3^0)<<14;
            ditherg2 ^= (3^0)<<14;
            ditherr2 ^= (3^0)<<14;
#else
            const int ditherb1=0;
            const int ditherg1=0;
            const int ditherr1=0;
            const int ditherb2=0;
            const int ditherg2=0;
            const int ditherr2=0;
#endif
            for(i=0; i<dstW-1; i+=2){
                // vertical linear interpolation && yuv2rgb in a single step:
                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
                int Cb= yuvtab_40cf[U];
                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                int Cr= yuvtab_3343[V];
                ((uint16_t*)dest)[i] =
                    clip_table15b[(Y1 + Cb + ditherb1) >>13] |
                    clip_table15g[(Y1 + Cg + ditherg1) >>13] |
                    clip_table15r[(Y1 + Cr + ditherr1) >>13];
                ((uint16_t*)dest)[i+1] =
                    clip_table15b[(Y2 + Cb + ditherb2) >>13] |
                    clip_table15g[(Y2 + Cg + ditherg2) >>13] |
                    clip_table15r[(Y2 + Cr + ditherr2) >>13];
            }
        }
#endif
    } //!FULL_UV_IPOL
}

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                    uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags)
{
    int uvalpha1=uvalpha^4095;
    const int yalpha1=0;

    if(flags&SWS_FULL_UV_IPOL)
    {
        RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstFormat, flags);
        return;
    }

#ifdef HAVE_MMX
    if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
    {
        if(dstFormat==IMGFMT_BGR32)
        {
            asm volatile(
                YSCALEYUV2RGB1
                WRITEBGR32
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstFormat==IMGFMT_BGR24)
        {
            asm volatile(
                "movl %4, %%ebx \n\t"
                YSCALEYUV2RGB1
                WRITEBGR24
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax", "%ebx"
            );
        }
        else if(dstFormat==IMGFMT_BGR15)
        {
            asm volatile(
                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                WRITEBGR15
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstFormat==IMGFMT_BGR16)
        {
            asm volatile(
                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                WRITEBGR16
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
    }
    else
    {
        if(dstFormat==IMGFMT_BGR32)
        {
            asm volatile(
                YSCALEYUV2RGB1b
                WRITEBGR32
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstFormat==IMGFMT_BGR24)
        {
            asm volatile(
                "movl %4, %%ebx \n\t"
                YSCALEYUV2RGB1b
                WRITEBGR24
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax", "%ebx"
            );
        }
        else if(dstFormat==IMGFMT_BGR15)
        {
            asm volatile(
                YSCALEYUV2RGB1b
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                WRITEBGR15
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
        else if(dstFormat==IMGFMT_BGR16)
        {
            asm volatile(
                YSCALEYUV2RGB1b
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                WRITEBGR16
                :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                   "m" (yalpha1), "m" (uvalpha1)
                : "%eax"
            );
        }
    }
#else
    //FIXME write 2 versions (for even & odd lines)
    if(dstFormat==IMGFMT_BGR32)
    {
        int i;
        for(i=0; i<dstW-1; i+=2){
            // vertical linear interpolation && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];
            dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
            dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
            dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
            dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
            dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
            dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
        }
    }
    else if(dstFormat==IMGFMT_BGR24)
    {
        int i;
        for(i=0; i<dstW-1; i+=2){
            // vertical linear interpolation && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];
            dest[0]=clip_table[((Y1 + Cb) >>13)];
            dest[1]=clip_table[((Y1 + Cg) >>13)];
            dest[2]=clip_table[((Y1 + Cr) >>13)];
            dest[3]=clip_table[((Y2 + Cb) >>13)];
            dest[4]=clip_table[((Y2 + Cg) >>13)];
            dest[5]=clip_table[((Y2 + Cr) >>13)];
            dest+=6;
        }
    }
    else if(dstFormat==IMGFMT_BGR16)
    {
        int i;
#ifdef DITHER1XBPP
        static int ditherb1=1<<14;
        static int ditherg1=1<<13;
        static int ditherr1=2<<14;
        static int ditherb2=3<<14;
        static int ditherg2=3<<13;
        static int ditherr2=0<<14;

        ditherb1 ^= (1^2)<<14;
        ditherg1 ^= (1^2)<<13;
        ditherr1 ^= (1^2)<<14;
        ditherb2 ^= (3^0)<<14;
        ditherg2 ^= (3^0)<<13;
        ditherr2 ^= (3^0)<<14;
#else
        const int ditherb1=0;
        const int ditherg1=0;
        const int ditherr1=0;
        const int ditherb2=0;
        const int ditherg2=0;
        const int ditherr2=0;
#endif
        for(i=0; i<dstW-1; i+=2){
            // vertical linear interpolation && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];
            ((uint16_t*)dest)[i] =
                clip_table16b[(Y1 + Cb + ditherb1) >>13] |
                clip_table16g[(Y1 + Cg + ditherg1) >>13] |
                clip_table16r[(Y1 + Cr + ditherr1) >>13];
            ((uint16_t*)dest)[i+1] =
                clip_table16b[(Y2 + Cb + ditherb2) >>13] |
                clip_table16g[(Y2 + Cg + ditherg2) >>13] |
                clip_table16r[(Y2 + Cr + ditherr2) >>13];
        }
    }
    else if(dstFormat==IMGFMT_BGR15)
    {
        int i;
#ifdef DITHER1XBPP
        static int ditherb1=1<<14;
        static int ditherg1=1<<14;
        static int ditherr1=2<<14;
        static int ditherb2=3<<14;
        static int ditherg2=3<<14;
        static int ditherr2=0<<14;

        ditherb1 ^= (1^2)<<14;
        ditherg1 ^= (1^2)<<14;
        ditherr1 ^= (1^2)<<14;
        ditherb2 ^= (3^0)<<14;
        ditherg2 ^= (3^0)<<14;
        ditherr2 ^= (3^0)<<14;
#else
        const int ditherb1=0;
        const int ditherg1=0;
        const int ditherr1=0;
        const int ditherb2=0;
        const int ditherg2=0;
        const int ditherr2=0;
#endif
        for(i=0; i<dstW-1; i+=2){
            // vertical linear interpolation && yuv2rgb in a single step:
            int Y1=yuvtab_2568[buf0[i]>>7];
            int Y2=yuvtab_2568[buf0[i+1]>>7];
            int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
            int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
            int Cb= yuvtab_40cf[U];
            int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
            int Cr= yuvtab_3343[V];
            ((uint16_t*)dest)[i] =
                clip_table15b[(Y1 + Cb + ditherb1) >>13] |
                clip_table15g[(Y1 + Cg + ditherg1) >>13] |
                clip_table15r[(Y1 + Cr + ditherr1) >>13];
            ((uint16_t*)dest)[i+1] =
                clip_table15b[(Y2 + Cb + ditherb2) >>13] |
                clip_table15g[(Y2 + Cg + ditherg2) >>13] |
                clip_table15r[(Y2 + Cr + ditherr2) >>13];
        }
    }
#endif
}

// Bilinear / Bicubic scaling
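
/*
 * hScale applies a horizontal FIR filter:
 *   dst[i] = clip( sum over j of src[filterPos[i]+j]*filter[i*filterSize+j] >> 7,
 *                  0, (1<<15)-1 )
 * as the C fallback at the end spells out. The MMX paths special-case
 * filterSize 4 and 8 and use a generic two-level loop otherwise; the
 * pmaddwd/psrad pairs do the multiply-accumulate and horizontal add in
 * fixed point.
 */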
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, int filterSize)
{
#ifdef HAVE_MMX
    if(filterSize==4) // always true for upscaling, sometimes for downscaling too
    {
        int counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
            "pxor %%mm7, %%mm7 \n\t"
            "movq "MANGLE(w02)", %%mm6 \n\t"
            "pushl %%ebp \n\t" // we use 7 regs here ...
            "movl %%eax, %%ebp \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            "movzwl (%2, %%ebp), %%eax \n\t"
            "movzwl 2(%2, %%ebp), %%ebx \n\t"
            "movq (%1, %%ebp, 4), %%mm1 \n\t"
            "movq 8(%1, %%ebp, 4), %%mm3 \n\t"
            "movd (%3, %%eax), %%mm0 \n\t"
            "movd (%3, %%ebx), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm3 \n\t"
            "packssdw %%mm3, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%ebp) \n\t"
            "addl $4, %%ebp \n\t"
            " jnc 1b \n\t"
            "popl %%ebp \n\t"
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
            : "%ebx"
        );
    }
    else if(filterSize==8)
    {
        int counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
            "pxor %%mm7, %%mm7 \n\t"
            "movq "MANGLE(w02)", %%mm6 \n\t"
            "pushl %%ebp \n\t" // we use 7 regs here ...
            "movl %%eax, %%ebp \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            "movzwl (%2, %%ebp), %%eax \n\t"
            "movzwl 2(%2, %%ebp), %%ebx \n\t"
            "movq (%1, %%ebp, 8), %%mm1 \n\t"
            "movq 16(%1, %%ebp, 8), %%mm3 \n\t"
            "movd (%3, %%eax), %%mm0 \n\t"
            "movd (%3, %%ebx), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq 8(%1, %%ebp, 8), %%mm1 \n\t"
            "movq 24(%1, %%ebp, 8), %%mm5 \n\t"
            "movd 4(%3, %%eax), %%mm4 \n\t"
            "movd 4(%3, %%ebx), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm4 \n\t"
            "pmaddwd %%mm2, %%mm5 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "paddd %%mm5, %%mm3 \n\t"
            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm3 \n\t"
            "packssdw %%mm3, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%ebp) \n\t"
            "addl $4, %%ebp \n\t"
            " jnc 1b \n\t"
            "popl %%ebp \n\t"
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
            : "%ebx"
        );
    }
    else
    {
        int counter= -2*dstW;
//      filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
            "pxor %%mm7, %%mm7 \n\t"
            "movq "MANGLE(w02)", %%mm6 \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            "movl %2, %%ecx \n\t"
            "movzwl (%%ecx, %0), %%eax \n\t"
            "movzwl 2(%%ecx, %0), %%ebx \n\t"
            "movl %5, %%ecx \n\t"
            "pxor %%mm4, %%mm4 \n\t"
            "pxor %%mm5, %%mm5 \n\t"
            "2: \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1, %6), %%mm3 \n\t"
            "movd (%%ecx, %%eax), %%mm0 \n\t"
            "movd (%%ecx, %%ebx), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "paddd %%mm3, %%mm5 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "addl $8, %1 \n\t"
            "addl $4, %%ecx \n\t"
            "cmpl %4, %%ecx \n\t"
            " jb 2b \n\t"
            "addl %6, %1 \n\t"
            "psrad $8, %%mm4 \n\t"
            "psrad $8, %%mm5 \n\t"
            "packssdw %%mm5, %%mm4 \n\t"
            "pmaddwd %%mm6, %%mm4 \n\t"
            "packssdw %%mm4, %%mm4 \n\t"
            "movl %3, %%eax \n\t"
            "movd %%mm4, (%%eax, %0) \n\t"
            "addl $4, %0 \n\t"
            " jnc 1b \n\t"
            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(src+filterSize),
              "m" (src), "r" (filterSize*2)
            : "%ebx", "%eax", "%ecx"
        );
    }
#else
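/* C reference implementation: one multiply-accumulate per tap. The
   coefficients are built elsewhere so that after the >>7 each dst value is
   the filtered pixel scaled by 128, matching the src[...]*128 scale used by
   the fast-bilinear paths (illustration only, not values from the
   filter-building code: taps {2048,6144,6144,2048} sum to 16384, so an
   average of four equal neighbours s yields s*16384>>7 == s*128). */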
int i;
for(i=0; i<dstW; i++)
{
int j;
int srcPos= filterPos[i];
int val=0;
// printf("filterPos: %d\n", filterPos[i]);
for(j=0; j<filterSize; j++)
{
// printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
val += ((int)src[srcPos + j])*filter[filterSize*i + j];
}
// filter += hFilterSize;
dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
// dst[i] = val>>7;
}
#endif
}
// *** horizontal scale Y line to temp buffer
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
int flags, int canMMX2BeUsed, int16_t *hLumFilter,
int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode)
{
#ifdef HAVE_MMX
// use the new MMX scaler if the MMX2 one can't be used (it's faster than the plain x86 asm one)
if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
if(!(flags&SWS_FAST_BILINEAR))
#endif
{
RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
}
else // Fast Bilinear upscale / crap downscale
{
#ifdef ARCH_X86
#ifdef HAVE_MMX2
int i;
if(canMMX2BeUsed)
{
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"pxor %%mm2, %%mm2 \n\t" // 2*xalpha
"movd %5, %%mm6 \n\t" // xInc&0xFFFF
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"movq %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t"
"paddw %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t"
"paddw %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
"movq %%mm2, %%mm4 \n\t"
"movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"xorl %%eax, %%eax \n\t" // i
"movl %0, %%esi \n\t" // src
"movl %1, %%edi \n\t" // buf1
"movl %3, %%edx \n\t" // (xInc*4)>>16
"xorl %%ecx, %%ecx \n\t"
"xorl %%ebx, %%ebx \n\t"
"movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
#define FUNNY_Y_CODE \
PREFETCH" 1024(%%esi) \n\t"\
PREFETCH" 1056(%%esi) \n\t"\
PREFETCH" 1088(%%esi) \n\t"\
"call *%6 \n\t"\
"movq %%mm4, %%mm2 \n\t"\
"xorl %%ecx, %%ecx \n\t"
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
:: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
"m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode)
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
);
for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; // replicate the last src pixel (already *128) where the filter would read past the end
}
else
{
#endif
// no MMX2, just plain x86 asm ...
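/* 16.16 fixed-point stepping without a multiply in the loop: the
   fractional part of the source position lives in %%cx and the integer
   part in %%ebx; "addw" adds the fractional increment and "adcl" folds its
   carry into the integer part, so xx advances by xInc>>16 or (xInc>>16)+1
   per output pixel. Two pixels are produced per pass to halve the loop
   overhead. */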
asm volatile(
"xorl %%eax, %%eax \n\t" // i
"xorl %%ebx, %%ebx \n\t" // xx
"xorl %%ecx, %%ecx \n\t" // 2*xalpha
".balign 16 \n\t"
"1: \n\t"
"movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
"movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
"shll $16, %%edi \n\t"
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
"movl %1, %%edi \n\t"
"shrl $9, %%esi \n\t"
"movw %%si, (%%edi, %%eax, 2) \n\t"
"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
"adcl %3, %%ebx \n\t" //xx+= xInc>>16 + carry
"movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
"movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
"shll $16, %%edi \n\t"
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
"movl %1, %%edi \n\t"
"shrl $9, %%esi \n\t"
"movw %%si, 2(%%edi, %%eax, 2) \n\t"
"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
"adcl %3, %%ebx \n\t" //xx+= xInc>>16 + carry
"addl $2, %%eax \n\t"
"cmpl %2, %%eax \n\t"
" jb 1b \n\t"
:: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
: "%eax", "%ebx", "%ecx", "%edi", "%esi"
);
#ifdef HAVE_MMX2
} //if MMX2 can't be used
#endif
#else
int i;
unsigned int xpos=0;
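/* portable C version: xpos is the 16.16 fixed-point source position;
   xalpha keeps the top 7 fraction bits, so the interpolated result comes
   out scaled by 128 like the other paths. */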
for(i=0;i<dstWidth;i++)
{
register unsigned int xx=xpos>>16;
register unsigned int xalpha=(xpos&0xFFFF)>>9;
dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
xpos+=xInc;
}
#endif
}
}
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode)
{
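/* Same as hyscale, but for the two chroma planes: the U result goes to
   dst[0..dstWidth-1] and the V result to dst[2048...], i.e. the second
   plane lives 2048 int16_t entries (4096 bytes) into the same buffer --
   that is where the "4096(...)" offsets in the asm below come from. */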
#ifdef HAVE_MMX
// use the new MMX scaler if the MMX2 one can't be used (it's faster than the plain x86 asm one)
if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
if(!(flags&SWS_FAST_BILINEAR))
#endif
{
RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
}
else // Fast Bilinear upscale / crap downscale
{
#ifdef ARCH_X86
#ifdef HAVE_MMX2
int i;
if(canMMX2BeUsed)
{
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"pxor %%mm2, %%mm2 \n\t" // 2*xalpha
"movd %5, %%mm6 \n\t" // xInc&0xFFFF
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"movq %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t"
"paddw %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t"
"paddw %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
"movq %%mm2, %%mm4 \n\t"
"movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"xorl %%eax, %%eax \n\t" // i
"movl %0, %%esi \n\t" // src
"movl %1, %%edi \n\t" // buf1
"movl %3, %%edx \n\t" // (xInc*4)>>16
"xorl %%ecx, %%ecx \n\t"
"xorl %%ebx, %%ebx \n\t"
"movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
#define FUNNYUVCODE \
PREFETCH" 1024(%%esi) \n\t"\
PREFETCH" 1056(%%esi) \n\t"\
PREFETCH" 1088(%%esi) \n\t"\
"call *%7 \n\t"\
"movq %%mm4, %%mm2 \n\t"\
"xorl %%ecx, %%ecx \n\t"
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
"xorl %%eax, %%eax \n\t" // i
"movl %6, %%esi \n\t" // src2
"movl %1, %%edi \n\t" // buf1
"addl $4096, %%edi \n\t"
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
:: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
"m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode)
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
);
for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
{
// printf("%d %d %d\n", dstWidth, i, srcW);
dst[i] = src1[srcW-1]*128;
dst[i+2048] = src2[srcW-1]*128;
}
}
else
{
#endif
asm volatile(
"xorl %%eax, %%eax \n\t" // i
"xorl %%ebx, %%ebx \n\t" // xx
"xorl %%ecx, %%ecx \n\t" // 2*xalpha
".balign 16 \n\t"
"1: \n\t"
"movl %0, %%esi \n\t"
"movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
"movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
"shll $16, %%edi \n\t"
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
"movl %1, %%edi \n\t"
"shrl $9, %%esi \n\t"
"movw %%si, (%%edi, %%eax, 2) \n\t"
"movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
"movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
"shll $16, %%edi \n\t"
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
"movl %1, %%edi \n\t"
"shrl $9, %%esi \n\t"
"movw %%si, 4096(%%edi, %%eax, 2)\n\t"
"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
"adcl %3, %%ebx \n\t" //xx+= xInc>>16 + carry
"addl $1, %%eax \n\t"
"cmpl %2, %%eax \n\t"
" jb 1b \n\t"
:: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
"r" (src2)
: "%eax", "%ebx", "%ecx", "%edi", "%esi"
);
#ifdef HAVE_MMX2
} //if MMX2 can't be used
#endif
#else
int i;
unsigned int xpos=0;
for(i=0;i<dstWidth;i++)
{
register unsigned int xx=xpos>>16;
register unsigned int xalpha=(xpos&0xFFFF)>>9;
dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); // xalpha^127 == 127-xalpha, as xalpha is 7 bit
dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
xpos+=xInc;
}
#endif
}
}
static void RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){
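/* Slice-based scaling: this gets called once per horizontal slice of the
   source image. Horizontally scaled lines are kept in the lumPixBuf /
   chrPixBuf ring buffers so vertical scaling can combine lines from the
   current and previous slices; the state (dstY, buffer indices, last
   buffered lines) is carried across calls in the context. */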
/* load a few things into local vars to make the code more readable and (hopefully) faster */
const int srcW= c->srcW;
const int dstW= c->dstW;
const int dstH= c->dstH;
const int chrDstW= c->chrDstW;
const int lumXInc= c->lumXInc;
const int chrXInc= c->chrXInc;
const int dstFormat= c->dstFormat;
const int flags= c->flags;
const int canMMX2BeUsed= c->canMMX2BeUsed;
int16_t *vLumFilterPos= c->vLumFilterPos;
int16_t *vChrFilterPos= c->vChrFilterPos;
int16_t *hLumFilterPos= c->hLumFilterPos;
int16_t *hChrFilterPos= c->hChrFilterPos;
int16_t *vLumFilter= c->vLumFilter;
int16_t *vChrFilter= c->vChrFilter;
int16_t *hLumFilter= c->hLumFilter;
int16_t *hChrFilter= c->hChrFilter;
int16_t *lumMmxFilter= c->lumMmxFilter;
int16_t *chrMmxFilter= c->chrMmxFilter;
const int vLumFilterSize= c->vLumFilterSize;
const int vChrFilterSize= c->vChrFilterSize;
const int hLumFilterSize= c->hLumFilterSize;
const int hChrFilterSize= c->hChrFilterSize;
int16_t **lumPixBuf= c->lumPixBuf;
int16_t **chrPixBuf= c->chrPixBuf;
const int vLumBufSize= c->vLumBufSize;
const int vChrBufSize= c->vChrBufSize;
uint8_t *funnyYCode= c->funnyYCode;
uint8_t *funnyUVCode= c->funnyUVCode;
/* vars which will change and which we need to store back in the context */
int dstY= c->dstY;
int lumBufIndex= c->lumBufIndex;
int chrBufIndex= c->chrBufIndex;
int lastInLumBuf= c->lastInLumBuf;
int lastInChrBuf= c->lastInChrBuf;
if(srcSliceY ==0){ // first slice of a new frame -> reset the ring-buffer state
lumBufIndex=0;
chrBufIndex=0;
dstY=0;
lastInLumBuf= -1;
lastInChrBuf= -1;
}
for(;dstY < dstH; dstY++){
unsigned char *dest =dst[0]+dstStride[0]*dstY;
unsigned char *uDest=dst[1]+dstStride[1]*(dstY>>1);
unsigned char *vDest=dst[2]+dstStride[2]*(dstY>>1);
const int chrDstY= dstFormat==IMGFMT_YV12 ? (dstY>>1) : dstY;
const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
//handle holes (FAST_BILINEAR & weird filters)
if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
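/* Ring-buffer invariant checked above: the buffers hold the vLumBufSize /
   vChrBufSize most recently scaled source lines (the newest being
   lastInLumBuf / lastInChrBuf), so every line the vertical filter needs
   for this output line must still be inside that window. */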
// Do we have enough lines in this slice to output the dstY line?
if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH)>>1))
{
//Do horizontal scaling
while(lastInLumBuf < lastLumSrcY)
{
uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
lumBufIndex++;
// printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
ASSERT(lumBufIndex < 2*vLumBufSize)
ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
// printf("%d %d\n", lumBufIndex, vLumBufSize);
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
funnyYCode);
lastInLumBuf++;
}
while(lastInChrBuf < lastChrSrcY)
{
uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
chrBufIndex++;
ASSERT(chrBufIndex < 2*vChrBufSize)
ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
//FIXME replace parameters through context struct (some at least)
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
funnyUVCode);
lastInChrBuf++;
}
//wrap buf index around to stay inside the ring buffer
if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
}
else // not enough lines left in this slice -> load the rest into the buffer
{
/* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
vChrBufSize, vLumBufSize);
*/
//Do horizontal scaling
while(lastInLumBuf+1 < srcSliceY + srcSliceH)
{
uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
lumBufIndex++;
ASSERT(lumBufIndex < 2*vLumBufSize)
ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
funnyYCode);
lastInLumBuf++;
}
while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
{
uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
chrBufIndex++;
ASSERT(chrBufIndex < 2*vChrBufSize)
ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
funnyUVCode);
lastInChrBuf++;
}
//wrap buf index around to stay inside the ring buffer
if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
break; //we can't output a dstY line so let's try with the next slice
}
#ifdef HAVE_MMX
// select the dither pattern for this output line (alternates between even and odd lines)
b5Dither= dither8[dstY&1];
g6Dither= dither4[dstY&1];
g5Dither= dither8[dstY&1];
r5Dither= dither8[(dstY+1)&1];
#endif
if(dstY < dstH-2)
{
if(dstFormat==IMGFMT_YV12) //YV12
{
if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
{
int16_t *lumBuf = lumPixBuf[0];
int16_t *chrBuf= chrPixBuf[0];
RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW);
}
else //General YV12
{
int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
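/* lumSrcPtr/chrSrcPtr point at the vertical filter's first input line
   inside the ring buffer; adding vLumBufSize/vChrBufSize keeps the index
   non-negative, which works because (in this scaler's setup code) the
   second half of the pixBuf pointer array mirrors the first, so wrap-around
   needs no modulo. */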
RENAME(yuv2yuvX)(
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
dest, uDest, vDest, dstW,
lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4);
}
}
else
{
int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
{
int chrAlpha= vChrFilter[2*dstY+1];
RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
dest, dstW, chrAlpha, dstFormat, flags);
}
else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
{
int lumAlpha= vLumFilter[2*dstY+1];
int chrAlpha= vChrFilter[2*dstY+1];
RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
dest, dstW, lumAlpha, chrAlpha, dstFormat, flags);
}
else //General RGB
{
RENAME(yuv2rgbX)(
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
dest, dstW, dstFormat,
lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
}
}
}
else // looks like we can't use MMX here without overwriting that array's tail
{
int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
if(dstFormat==IMGFMT_YV12) //YV12
{
if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
yuv2yuvXinC(
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
dest, uDest, vDest, dstW);
}
else
{
ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
yuv2rgbXinC(
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
dest, dstW, dstFormat);
}
}
}
#ifdef HAVE_MMX
__asm __volatile(SFENCE:::"memory"); // make the non-temporal (MOVNTQ) stores globally visible
__asm __volatile(EMMS:::"memory"); // leave MMX state so the FPU is usable again
#endif
/* store changed local vars back in the context */
c->dstY= dstY;
c->lumBufIndex= lumBufIndex;
c->chrBufIndex= chrBufIndex;
c->lastInLumBuf= lastInLumBuf;
c->lastInChrBuf= lastInChrBuf;
}