/*
Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define YSCALEYUV2YV12X(x) \
"xorl %%eax, %%eax \n\t"\
"pxor %%mm3, %%mm3 \n\t"\
"pxor %%mm4, %%mm4 \n\t"\
"movl %0, %%edx \n\t"\
".balign 16 \n\t" /* FIXME Unroll? */\
"1: \n\t"\
"movl (%1, %%edx, 4), %%esi \n\t"\
"movq (%2, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
"movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\
"movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"addl $1, %%edx \n\t"\
" jnz 1b \n\t"\
"psraw $3, %%mm3 \n\t"\
"psraw $3, %%mm4 \n\t"\
"packuswb %%mm4, %%mm3 \n\t"\
MOVNTQ(%%mm3, (%3, %%eax))\
"addl $8, %%eax \n\t"\
"cmpl %4, %%eax \n\t"\
"pxor %%mm3, %%mm3 \n\t"\
"pxor %%mm4, %%mm4 \n\t"\
"movl %0, %%edx \n\t"\
"jb 1b \n\t"
#define YSCALEYUV2YV121 \
"movl %2, %%eax \n\t"\
".balign 16 \n\t" /* FIXME Unroll? */\
"1: \n\t"\
"movq (%0, %%eax, 2), %%mm0 \n\t"\
"movq 8(%0, %%eax, 2), %%mm1 \n\t"\
"psraw $7, %%mm0 \n\t"\
"psraw $7, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
MOVNTQ(%%mm0, (%1, %%eax))\
"addl $8, %%eax \n\t"\
"jnc 1b \n\t"
/*
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
"r" (dest), "m" (dstW),
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2RGBX \
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movl %1, %%edx \n\t" /* -chrFilterSize */\
"movl %3, %%ebx \n\t" /* chrMmxFilter+chrFilterSize */\
"movl %7, %%ecx \n\t" /* chrSrc+chrFilterSize */\
"pxor %%mm3, %%mm3 \n\t"\
"pxor %%mm4, %%mm4 \n\t"\
"2: \n\t"\
"movl (%%ecx, %%edx, 4), %%esi \n\t"\
"movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
"movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\
"movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"addl $1, %%edx \n\t"\
" jnz 2b \n\t"\
\
"movl %0, %%edx \n\t" /* -lumFilterSize */\
"movl %2, %%ebx \n\t" /* lumMmxFilter+lumFilterSize */\
"movl %6, %%ecx \n\t" /* lumSrc+lumFilterSize */\
"pxor %%mm1, %%mm1 \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"2: \n\t"\
"movl (%%ecx, %%edx, 4), %%esi \n\t"\
"movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
"movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\
"movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm1 \n\t"\
"paddw %%mm5, %%mm7 \n\t"\
"addl $1, %%edx \n\t"\
" jnz 2b \n\t"\
\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
"psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
"pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
"pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define FULL_YSCALEYUV2RGB \
"pxor %%mm7, %%mm7 \n\t"\
"movd %6, %%mm6 \n\t" /*yalpha1*/\
"punpcklwd %%mm6, %%mm6 \n\t"\
"punpcklwd %%mm6, %%mm6 \n\t"\
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
\
\
"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
"psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
\
\
"movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
"paddw %%mm1, %%mm3 \n\t" /* B*/\
"paddw %%mm1, %%mm0 \n\t" /* R*/\
"packuswb %%mm3, %%mm3 \n\t"\
\
"packuswb %%mm0, %%mm0 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm2, %%mm1 \n\t" /* G*/\
\
"packuswb %%mm1, %%mm1 \n\t"
#define YSCALEYUV2RGB \
"movd %6, %%mm6 \n\t" /*yalpha1*/\
"punpcklwd %%mm6, %%mm6 \n\t"\
"punpcklwd %%mm6, %%mm6 \n\t"\
"movq %%mm6, 3968(%2) \n\t"\
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"movq %%mm5, 3976(%2) \n\t"\
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq 3976(%2), %%mm0 \n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
"psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
"movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw 3968(%2), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw 3968(%2), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
"pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
"pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1 \
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
"psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
"pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
"pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
// do vertical chrominance interpolation
#define YSCALEYUV2RGB1b \
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
"psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
"psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
"pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
"pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define WRITEBGR32 \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
MOVNTQ(%%mm0, (%4, %%eax, 4))\
MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"
#define WRITEBGR16 \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
"pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
"psrlq $3, %%mm2 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $3, %%mm3 \n\t"\
"psllq $3, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (%4, %%eax, 2))\
MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"
#define WRITEBGR15 \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
"pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
"psrlq $3, %%mm2 \n\t"\
"psrlq $1, %%mm5 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $2, %%mm3 \n\t"\
"psllq $2, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (%4, %%eax, 2))\
MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"
#define WRITEBGR24OLD \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
"psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
"pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
"por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
"movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
"psllq $48, %%mm2 \n\t" /* GB000000 1 */\
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
"movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
"psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
"por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
"pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
"movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
"psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
"pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
"por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
"movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
"psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
"por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
"psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
"movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
"psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
"pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
"pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
"por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
"psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
"por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
MOVNTQ(%%mm0, (%%ebx))\
MOVNTQ(%%mm2, 8(%%ebx))\
MOVNTQ(%%mm3, 16(%%ebx))\
"addl $24, %%ebx \n\t"\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"
#define WRITEBGR24MMX \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
"movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
"movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
"movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
"psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
"psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
"psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
"psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
"punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
"punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
"punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
"punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
"psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
"movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
"psllq $40, %%mm2 \n\t" /* GB000000 1 */\
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
MOVNTQ(%%mm0, (%%ebx))\
\
"psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
"movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
"psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
"por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
MOVNTQ(%%mm6, 8(%%ebx))\
\
"psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
"psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
"por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
MOVNTQ(%%mm5, 16(%%ebx))\
\
"addl $24, %%ebx \n\t"\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"
#define WRITEBGR24MMX2 \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq "MANGLE(M24A)", %%mm0 \n\t"\
"movq "MANGLE(M24C)", %%mm7 \n\t"\
"pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
"pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
"pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
"pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
"pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
"pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
"psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
"por %%mm1, %%mm6 \n\t"\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, (%%ebx))\
\
"psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
"pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
"pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
"pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
"pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
"pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
"pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
"por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, 8(%%ebx))\
\
"pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
"pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
"pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
"pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
"pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
"pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
"por %%mm1, %%mm3 \n\t"\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, 16(%%ebx))\
\
"addl $24, %%ebx \n\t"\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX2
#else
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX
#endif
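/* 24-bit output has no native store width, hence three variants: the
   older WRITEBGR24OLD (kept for reference, never selected), a plain-MMX
   shift/merge version (WRITEBGR24MMX) and an MMX2 version built on
   pshufw; the #ifdef above picks one as WRITEBGR24. All of them advance
   %%ebx by 24 bytes per 8 pixels. */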
static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW,
int16_t * lumMmxFilter, int16_t * chrMmxFilter)
{
#ifdef HAVE_MMX
if(uDest != NULL)
{
asm volatile(
YSCALEYUV2YV12X(0)
:: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
"r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (chrDstW)
: "%eax", "%edx", "%esi"
);
asm volatile(
YSCALEYUV2YV12X(4096)
:: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
"r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (chrDstW)
: "%eax", "%edx", "%esi"
);
}
asm volatile(
YSCALEYUV2YV12X(0)
:: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
"r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
: "%eax", "%edx", "%esi"
);
#else
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize,
dest, uDest, vDest, dstW, chrDstW);
#endif
}
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
if(uDest != NULL)
{
asm volatile(
YSCALEYUV2YV121
:: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
"g" (-chrDstW)
: "%eax"
);
asm volatile(
YSCALEYUV2YV121
:: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
"g" (-chrDstW)
: "%eax"
);
}
asm volatile(
YSCALEYUV2YV121
:: "r" (lumSrc + dstW), "r" (dest + dstW),
"g" (-dstW)
: "%eax"
);
#else
int i;
for(i=0; i<dstW; i++)
{
int val= lumSrc[i]>>7;
// fast approximate clip: bit 8 set means val fell outside 0..255
// (assumes the overshoot stays below 256)
if(val&256){
if(val<0) val=0;
else val=255;
}
dest[i]= val;
}
if(uDest != NULL)
for(i=0; i<chrDstW; i++)
{
int u=chrSrc[i]>>7;
int v=chrSrc[i + 2048]>>7;
// same fast-clip trick, testing both components at once
if((u|v)&256){
if(u<0) u=0;
else if (u>255) u=255;
if(v<0) v=0;
else if (v>255) v=255;
}
uDest[i]= u;
vDest[i]= v;
}
#endif
}
/**
* vertical scale YV12 to RGB
*/
static inline void RENAME(yuv2rgbX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, int dstW, int16_t * lumMmxFilter, int16_t * chrMmxFilter, int dstY)
{
switch(c->dstFormat)
{
#ifdef HAVE_MMX
case IMGFMT_BGR32:
{
asm volatile(
YSCALEYUV2RGBX
WRITEBGR32
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
"r" (dest), "m" (dstW),
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
);
}
break;
case IMGFMT_BGR24:
{
asm volatile(
YSCALEYUV2RGBX
"leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize
"addl %4, %%ebx \n\t"
WRITEBGR24
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
"r" (dest), "m" (dstW),
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
);
}
break;
case IMGFMT_BGR15:
{
asm volatile(
YSCALEYUV2RGBX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR15
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
"r" (dest), "m" (dstW),
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
);
}
break;
case IMGFMT_BGR16:
{
asm volatile(
YSCALEYUV2RGBX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR16
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
"r" (dest), "m" (dstW),
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
);
}
break;
#endif
default:
yuv2rgbXinC(c, lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize,
dest, dstW, dstY);
break;
}
}
/**
* vertical bilinear scale YV12 to RGB
*/
static inline void RENAME(yuv2rgb2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
int yalpha1=yalpha^4095;
int uvalpha1=uvalpha^4095;
int i;
#if 0 // isn't used
if(flags&SWS_FULL_CHR_H_INT)
{
switch(dstFormat)
{
#ifdef HAVE_MMX
case IMGFMT_BGR32:
asm volatile(
FULL_YSCALEYUV2RGB
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
"movq %%mm3, %%mm1 \n\t"
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
MOVNTQ(%%mm3, (%4, %%eax, 4))
MOVNTQ(%%mm1, 8(%4, %%eax, 4))
"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
break;
case IMGFMT_BGR24:
asm volatile(
FULL_YSCALEYUV2RGB
// lsb ... msb
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
"movq %%mm3, %%mm1 \n\t"
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
"movq %%mm3, %%mm2 \n\t" // BGR0BGR0
"psrlq $8, %%mm3 \n\t" // GR0BGR00
"pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
"pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
"por %%mm2, %%mm3 \n\t" // BGRBGR00
"movq %%mm1, %%mm2 \n\t"
"psllq $48, %%mm1 \n\t" // 000000BG
"por %%mm1, %%mm3 \n\t" // BGRBGRBG
"movq %%mm2, %%mm1 \n\t" // BGR0BGR0
"psrld $16, %%mm2 \n\t" // R000R000
"psrlq $24, %%mm1 \n\t" // 0BGR0000
"por %%mm2, %%mm1 \n\t" // RBGRR000
"movl %4, %%ebx \n\t"
"addl %%eax, %%ebx \n\t"
#ifdef HAVE_MMX2
//FIXME Alignment
"movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
"movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
"movd %%mm3, (%%ebx, %%eax, 2) \n\t"
"psrlq $32, %%mm3 \n\t"
"movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
"movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
#endif
"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax", "%ebx"
);
break;
case IMGFMT_BGR15:
asm volatile(
FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
"paddusb "MANGLE(g5Dither)", %%mm1\n\t"
"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
"psrlw $3, %%mm3 \n\t"
"psllw $2, %%mm1 \n\t"
"psllw $7, %%mm0 \n\t"
"pand "MANGLE(g15Mask)", %%mm1 \n\t"
"pand "MANGLE(r15Mask)", %%mm0 \n\t"
"por %%mm3, %%mm1 \n\t"
"por %%mm1, %%mm0 \n\t"
MOVNTQ(%%mm0, (%4, %%eax, 2))
"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
break;
case IMGFMT_BGR16:
asm volatile(
FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
"paddusb "MANGLE(g6Dither)", %%mm1\n\t"
"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
"psrlw $3, %%mm3 \n\t"
"psllw $3, %%mm1 \n\t"
"psllw $8, %%mm0 \n\t"
"pand "MANGLE(g16Mask)", %%mm1 \n\t"
"pand "MANGLE(r16Mask)", %%mm0 \n\t"
"por %%mm3, %%mm1 \n\t"
"por %%mm1, %%mm0 \n\t"
MOVNTQ(%%mm0, (%4, %%eax, 2))
"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
break;
#endif
case IMGFMT_RGB32:
#ifndef HAVE_MMX
case IMGFMT_BGR32:
#endif
if(dstFormat==IMGFMT_BGR32)
{
int i;
#ifdef WORDS_BIGENDIAN
dest++;
#endif
for(i=0;i<dstW;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
dest+= 4;
}
}
else if(dstFormat==IMGFMT_BGR24)
{
int i;
for(i=0;i<dstW;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
dest+= 3;
}
}
else if(dstFormat==IMGFMT_BGR16)
{
int i;
for(i=0;i<dstW;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
((uint16_t*)dest)[i] =
clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
clip_table16r[(Y + yuvtab_3343[V]) >>13];
}
}
else if(dstFormat==IMGFMT_BGR15)
{
int i;
for(i=0;i<dstW;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
((uint16_t*)dest)[i] =
clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
clip_table15r[(Y + yuvtab_3343[V]) >>13];
}
}
}//FULL_UV_IPOL
else
{
#endif // if 0
#ifdef HAVE_MMX
switch(c->dstFormat)
{
case IMGFMT_BGR32:
asm volatile(
YSCALEYUV2RGB
WRITEBGR32
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
return;
case IMGFMT_BGR24:
asm volatile(
"movl %4, %%ebx \n\t"
YSCALEYUV2RGB
WRITEBGR24
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax", "%ebx"
);
return;
case IMGFMT_BGR15:
asm volatile(
YSCALEYUV2RGB
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR15
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
return;
case IMGFMT_BGR16:
asm volatile(
YSCALEYUV2RGB
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR16
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
return;
default: break;
}
#endif //HAVE_MMX
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C)
}
/**
* YV12 to RGB without scaling or interpolating
*/
static inline void RENAME(yuv2rgb1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
int uvalpha1=uvalpha^4095;
const int yalpha1=0;
int i;
uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
const int yalpha= 4096; //FIXME ...
if(flags&SWS_FULL_CHR_H_INT)
{
RENAME(yuv2rgb2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
return;
}
#ifdef HAVE_MMX
if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
{
switch(dstFormat)
{
case IMGFMT_BGR32:
asm volatile(
YSCALEYUV2RGB1
WRITEBGR32
:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
return;
case IMGFMT_BGR24:
asm volatile(
"movl %4, %%ebx \n\t"
YSCALEYUV2RGB1
WRITEBGR24
:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax", "%ebx"
);
return;
case IMGFMT_BGR15:
asm volatile(
YSCALEYUV2RGB1
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR15
:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
return;
case IMGFMT_BGR16:
asm volatile(
YSCALEYUV2RGB1
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR16
:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
return;
}
}
else
{
switch(dstFormat)
{
case IMGFMT_BGR32:
asm volatile(
YSCALEYUV2RGB1b
WRITEBGR32
:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
return;
case IMGFMT_BGR24:
asm volatile(
"movl %4, %%ebx \n\t"
YSCALEYUV2RGB1b
WRITEBGR24
:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax", "%ebx"
);
return;
case IMGFMT_BGR15:
asm volatile(
YSCALEYUV2RGB1b
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR15
:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
return;
case IMGFMT_BGR16:
asm volatile(
YSCALEYUV2RGB1b
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR16
:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
return;
}
}
#endif
if( uvalpha < 2048 )
{
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C)
}else{
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C)
}
}
//FIXME the yuy2* functions can read up to 7 samples too many
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
asm volatile(
"movq "MANGLE(bm01010101)", %%mm2\n\t"
"movl %0, %%eax \n\t"
"1: \n\t"
"movq (%1, %%eax,2), %%mm0 \n\t"
"movq 8(%1, %%eax,2), %%mm1 \n\t"
"pand %%mm2, %%mm0 \n\t"
"pand %%mm2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"movq %%mm0, (%2, %%eax) \n\t"
"addl $8, %%eax \n\t"
" js 1b \n\t"
: : "g" (-width), "r" (src+width*2), "r" (dst+width)
: "%eax"
);
#else
int i;
for(i=0; i<width; i++)
dst[i]= src[2*i];
#endif
}
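/* yuy2ToY: YUY2 stores Y,U,Y,V,...; masking with bm01010101 (the
   0x00FF00FF... byte pattern) keeps the even (luma) bytes and packuswb
   compacts 16 input bytes to 8 Y samples per iteration. yuy2ToUV below
   extracts the odd bytes the same way, after PAVGB-averaging the
   chroma of two adjacent lines. */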
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
asm volatile(
"movq "MANGLE(bm01010101)", %%mm4\n\t"
"movl %0, %%eax \n\t"
"1: \n\t"
"movq (%1, %%eax,4), %%mm0 \n\t"
"movq 8(%1, %%eax,4), %%mm1 \n\t"
"movq (%2, %%eax,4), %%mm2 \n\t"
"movq 8(%2, %%eax,4), %%mm3 \n\t"
PAVGB(%%mm2, %%mm0)
PAVGB(%%mm3, %%mm1)
"psrlw $8, %%mm0 \n\t"
"psrlw $8, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"psrlw $8, %%mm0 \n\t"
"pand %%mm4, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
"movd %%mm0, (%4, %%eax) \n\t"
"movd %%mm1, (%3, %%eax) \n\t"
"addl $4, %%eax \n\t"
" js 1b \n\t"
: : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
: "%eax"
);
#else
int i;
for(i=0; i<width; i++)
{
dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
}
#endif
}
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMXFIXME
#else
int i;
for(i=0; i<width; i++)
{
int b= src[i*4+0];
int g= src[i*4+1];
int r= src[i*4+2];
dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
}
#endif
}
static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#ifdef HAVE_MMXFIXME
#else
int i;
for(i=0; i<width; i++)
{
int b= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
int r= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
}
#endif
}
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
asm volatile(
"movl %2, %%eax \n\t"
"movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
"movq "MANGLE(w1111)", %%mm5 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"leal (%%eax, %%eax, 2), %%ebx \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%ebx) \n\t"
"movd (%0, %%ebx), %%mm0 \n\t"
"movd 3(%0, %%ebx), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"movd 6(%0, %%ebx), %%mm2 \n\t"
"movd 9(%0, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm1, %%mm0 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
"pmaddwd %%mm5, %%mm2 \n\t"
"packssdw %%mm2, %%mm0 \n\t"
"psraw $7, %%mm0 \n\t"
"movd 12(%0, %%ebx), %%mm4 \n\t"
"movd 15(%0, %%ebx), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"movd 18(%0, %%ebx), %%mm2 \n\t"
"movd 21(%0, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm1, %%mm4 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
"pmaddwd %%mm5, %%mm2 \n\t"
"addl $24, %%ebx \n\t"
"packssdw %%mm2, %%mm4 \n\t"
"psraw $7, %%mm4 \n\t"
"packuswb %%mm4, %%mm0 \n\t"
"paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
"movq %%mm0, (%1, %%eax) \n\t"
"addl $8, %%eax \n\t"
" js 1b \n\t"
: : "r" (src+width*3), "r" (dst+width), "g" (-width)
: "%eax", "%ebx"
);
#else
int i;
for(i=0; i<width; i++)
{
int b= src[i*3+0];
int g= src[i*3+1];
int r= src[i*3+2];
dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
}
#endif
}
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#ifdef HAVE_MMX
asm volatile(
"movl %4, %%eax \n\t"
"movq "MANGLE(w1111)", %%mm5 \n\t"
"movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"leal (%%eax, %%eax, 2), %%ebx \n\t"
"addl %%ebx, %%ebx \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%ebx) \n\t"
PREFETCH" 64(%1, %%ebx) \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
"movq (%0, %%ebx), %%mm0 \n\t"
"movq (%1, %%ebx), %%mm1 \n\t"
"movq 6(%0, %%ebx), %%mm2 \n\t"
"movq 6(%1, %%ebx), %%mm3 \n\t"
PAVGB(%%mm1, %%mm0)
PAVGB(%%mm3, %%mm2)
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlq $24, %%mm0 \n\t"
"psrlq $24, %%mm2 \n\t"
PAVGB(%%mm1, %%mm0)
PAVGB(%%mm3, %%mm2)
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
#else
"movd (%0, %%ebx), %%mm0 \n\t"
"movd (%1, %%ebx), %%mm1 \n\t"
"movd 3(%0, %%ebx), %%mm2 \n\t"
"movd 3(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"movd 6(%0, %%ebx), %%mm4 \n\t"
"movd 6(%1, %%ebx), %%mm1 \n\t"
"movd 9(%0, %%ebx), %%mm2 \n\t"
"movd 9(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm4 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm4, %%mm2 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm2 \n\t"
#endif
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm2, %%mm0 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
"pmaddwd %%mm5, %%mm1 \n\t"
"packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
"psraw $7, %%mm0 \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
"movq 12(%0, %%ebx), %%mm4 \n\t"
"movq 12(%1, %%ebx), %%mm1 \n\t"
"movq 18(%0, %%ebx), %%mm2 \n\t"
"movq 18(%1, %%ebx), %%mm3 \n\t"
PAVGB(%%mm1, %%mm4)
PAVGB(%%mm3, %%mm2)
"movq %%mm4, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlq $24, %%mm4 \n\t"
"psrlq $24, %%mm2 \n\t"
PAVGB(%%mm1, %%mm4)
PAVGB(%%mm3, %%mm2)
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
#else
"movd 12(%0, %%ebx), %%mm4 \n\t"
"movd 12(%1, %%ebx), %%mm1 \n\t"
"movd 15(%0, %%ebx), %%mm2 \n\t"
"movd 15(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm4 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm4 \n\t"
"movd 18(%0, %%ebx), %%mm5 \n\t"
"movd 18(%1, %%ebx), %%mm1 \n\t"
"movd 21(%0, %%ebx), %%mm2 \n\t"
"movd 21(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm5 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm5, %%mm2 \n\t"
"movq "MANGLE(w1111)", %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm2 \n\t"
#endif
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
"pmaddwd %%mm4, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm2, %%mm4 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
"pmaddwd %%mm5, %%mm1 \n\t"
"addl $24, %%ebx \n\t"
"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
"psraw $7, %%mm4 \n\t"
"movq %%mm0, %%mm1 \n\t"
"punpckldq %%mm4, %%mm0 \n\t"
"punpckhdq %%mm4, %%mm1 \n\t"
"packsswb %%mm1, %%mm0 \n\t"
"paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
"movd %%mm0, (%2, %%eax) \n\t"
"punpckhdq %%mm0, %%mm0 \n\t"
"movd %%mm0, (%3, %%eax) \n\t"
"addl $4, %%eax \n\t"
" js 1b \n\t"
: : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
: "%eax", "%ebx"
);
#else
int i;
for(i=0; i<width; i++)
{
int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
}
#endif
}
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
{
int i;
for(i=0; i<width; i++)
{
int d= src[i*2] + (src[i*2+1]<<8);
int b= d&0x1F;
int g= (d>>5)&0x3F;
int r= (d>>11)&0x1F;
dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
}
}
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
int i;
for(i=0; i<width; i++)
{
#if 1
int d0= le2me_32( ((uint32_t*)src1)[i] );
int d1= le2me_32( ((uint32_t*)src2)[i] );
int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
int dh2= (dh>>11) + (dh<<21);
int d= dh2 + dl;
int b= d&0x7F;
int r= (d>>11)&0x7F;
int g= d>>21;
#else
int d0= src1[i*4] + (src1[i*4+1]<<8);
int b0= d0&0x1F;
int g0= (d0>>5)&0x3F;
int r0= (d0>>11)&0x1F;
int d1= src1[i*4+2] + (src1[i*4+3]<<8);
int b1= d1&0x1F;
int g1= (d1>>5)&0x3F;
int r1= (d1>>11)&0x1F;
int d2= src2[i*4] + (src2[i*4+1]<<8);
int b2= d2&0x1F;
int g2= (d2>>5)&0x3F;
int r2= (d2>>11)&0x1F;
int d3= src2[i*4+2] + (src2[i*4+3]<<8);
int b3= d3&0x1F;
int g3= (d3>>5)&0x3F;
int r3= (d3>>11)&0x1F;
int b= b0 + b1 + b2 + b3;
int g= g0 + g1 + g2 + g3;
int r= r0 + r1 + r2 + r3;
#endif
dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
}
}
  1570. static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
  1571. {
  1572. int i;
  1573. for(i=0; i<width; i++)
  1574. {
  1575. int d= src[i*2] + (src[i*2+1]<<8);
  1576. int b= d&0x1F;
  1577. int g= (d>>5)&0x1F;
  1578. int r= (d>>10)&0x1F;
  1579. dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
  1580. }
  1581. }
  1582. static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
  1583. {
  1584. int i;
  1585. for(i=0; i<width; i++)
  1586. {
  1587. #if 1
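		/* same SWAR summing trick as in bgr16ToUV above, with the masks
		   adjusted for the 5-bit green field of RGB555 */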
		int d0= le2me_32( ((uint32_t*)src1)[i] );
		int d1= le2me_32( ((uint32_t*)src2)[i] );
		int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
		int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;
		int b= d&0x7F;
		int r= (d>>10)&0x7F;
		int g= d>>21;
#else
		int d0= src1[i*4] + (src1[i*4+1]<<8);
		int b0= d0&0x1F;
		int g0= (d0>>5)&0x1F;
		int r0= (d0>>10)&0x1F;
		int d1= src1[i*4+2] + (src1[i*4+3]<<8);
		int b1= d1&0x1F;
		int g1= (d1>>5)&0x1F;
		int r1= (d1>>10)&0x1F;
		int d2= src2[i*4] + (src2[i*4+1]<<8);
		int b2= d2&0x1F;
		int g2= (d2>>5)&0x1F;
		int r2= (d2>>10)&0x1F;
		int d3= src2[i*4+2] + (src2[i*4+3]<<8);
		int b3= d3&0x1F;
		int g3= (d3>>5)&0x1F;
		int r3= (d3>>10)&0x1F;
		int b= b0 + b1 + b2 + b3;
		int g= g0 + g1 + g2 + g3;
		int r= r0 + r1 + r2 + r3;
#endif
		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
	}
}

static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int r= src[i*4+0];
		int g= src[i*4+1];
		int b= src[i*4+2];
		dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
	}
}

static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int r= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
		int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
		int b= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
}

static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int r= src[i*3+0];
		int g= src[i*3+1];
		int b= src[i*3+2];
		dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
	}
}

static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
		int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
		int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
}

// Bilinear / Bicubic scaling
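/* generic horizontal convolution:
 *   dst[i] = clip( (sum_j src[filterPos[i] + j] * filter[i*filterSize + j]) >> 7 )
 * producing a 15-bit intermediate; the MMX paths below hardcode the common
 * filterSize==4 and ==8 cases and use a generic inner loop otherwise */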
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
				  int16_t *filter, int16_t *filterPos, int filterSize)
{
#ifdef HAVE_MMX
	if(filterSize==4) // always true for upscaling, sometimes for downscaling too
	{
		int counter= -2*dstW;
		filter-= counter*2;
		filterPos-= counter/2;
		dst-= counter/2;
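		/* counter runs from -2*dstW up to 0; filter/filterPos/dst were
		   biased by it above so that (base + counter) walks forward
		   through the arrays, and the loop can end on the carry of
		   "addl $4, %%ebp" ("jnc 1b") instead of an extra compare */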
		asm volatile(
			"pxor %%mm7, %%mm7 \n\t"
			"movq "MANGLE(w02)", %%mm6 \n\t"
			"pushl %%ebp \n\t" // we use 7 regs here ...
			"movl %%eax, %%ebp \n\t"
			".balign 16 \n\t"
			"1: \n\t"
			"movzwl (%2, %%ebp), %%eax \n\t"
			"movzwl 2(%2, %%ebp), %%ebx \n\t"
			"movq (%1, %%ebp, 4), %%mm1 \n\t"
			"movq 8(%1, %%ebp, 4), %%mm3 \n\t"
			"movd (%3, %%eax), %%mm0 \n\t"
			"movd (%3, %%ebx), %%mm2 \n\t"
			"punpcklbw %%mm7, %%mm0 \n\t"
			"punpcklbw %%mm7, %%mm2 \n\t"
			"pmaddwd %%mm1, %%mm0 \n\t"
			"pmaddwd %%mm2, %%mm3 \n\t"
			"psrad $8, %%mm0 \n\t"
			"psrad $8, %%mm3 \n\t"
			"packssdw %%mm3, %%mm0 \n\t"
			"pmaddwd %%mm6, %%mm0 \n\t"
			"packssdw %%mm0, %%mm0 \n\t"
			"movd %%mm0, (%4, %%ebp) \n\t"
			"addl $4, %%ebp \n\t"
			" jnc 1b \n\t"
			"popl %%ebp \n\t"
			: "+a" (counter)
			: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
			: "%ebx"
		);
	}
	else if(filterSize==8)
	{
		int counter= -2*dstW;
		filter-= counter*4;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7 \n\t"
			"movq "MANGLE(w02)", %%mm6 \n\t"
			"pushl %%ebp \n\t" // we use 7 regs here ...
			"movl %%eax, %%ebp \n\t"
			".balign 16 \n\t"
			"1: \n\t"
			"movzwl (%2, %%ebp), %%eax \n\t"
			"movzwl 2(%2, %%ebp), %%ebx \n\t"
			"movq (%1, %%ebp, 8), %%mm1 \n\t"
			"movq 16(%1, %%ebp, 8), %%mm3 \n\t"
			"movd (%3, %%eax), %%mm0 \n\t"
			"movd (%3, %%ebx), %%mm2 \n\t"
			"punpcklbw %%mm7, %%mm0 \n\t"
			"punpcklbw %%mm7, %%mm2 \n\t"
			"pmaddwd %%mm1, %%mm0 \n\t"
			"pmaddwd %%mm2, %%mm3 \n\t"
			"movq 8(%1, %%ebp, 8), %%mm1 \n\t"
			"movq 24(%1, %%ebp, 8), %%mm5 \n\t"
			"movd 4(%3, %%eax), %%mm4 \n\t"
			"movd 4(%3, %%ebx), %%mm2 \n\t"
			"punpcklbw %%mm7, %%mm4 \n\t"
			"punpcklbw %%mm7, %%mm2 \n\t"
			"pmaddwd %%mm1, %%mm4 \n\t"
			"pmaddwd %%mm2, %%mm5 \n\t"
			"paddd %%mm4, %%mm0 \n\t"
			"paddd %%mm5, %%mm3 \n\t"
			"psrad $8, %%mm0 \n\t"
			"psrad $8, %%mm3 \n\t"
			"packssdw %%mm3, %%mm0 \n\t"
			"pmaddwd %%mm6, %%mm0 \n\t"
			"packssdw %%mm0, %%mm0 \n\t"
			"movd %%mm0, (%4, %%ebp) \n\t"
			"addl $4, %%ebp \n\t"
			" jnc 1b \n\t"
			"popl %%ebp \n\t"
			: "+a" (counter)
			: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
			: "%ebx"
		);
	}
	else
	{
		int counter= -2*dstW;
//		filter-= counter*filterSize/2;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7 \n\t"
			"movq "MANGLE(w02)", %%mm6 \n\t"
			".balign 16 \n\t"
			"1: \n\t"
			"movl %2, %%ecx \n\t"
			"movzwl (%%ecx, %0), %%eax \n\t"
			"movzwl 2(%%ecx, %0), %%ebx \n\t"
			"movl %5, %%ecx \n\t"
			"pxor %%mm4, %%mm4 \n\t"
			"pxor %%mm5, %%mm5 \n\t"
			"2: \n\t"
			"movq (%1), %%mm1 \n\t"
			"movq (%1, %6), %%mm3 \n\t"
			"movd (%%ecx, %%eax), %%mm0 \n\t"
			"movd (%%ecx, %%ebx), %%mm2 \n\t"
			"punpcklbw %%mm7, %%mm0 \n\t"
			"punpcklbw %%mm7, %%mm2 \n\t"
			"pmaddwd %%mm1, %%mm0 \n\t"
			"pmaddwd %%mm2, %%mm3 \n\t"
			"paddd %%mm3, %%mm5 \n\t"
			"paddd %%mm0, %%mm4 \n\t"
			"addl $8, %1 \n\t"
			"addl $4, %%ecx \n\t"
			"cmpl %4, %%ecx \n\t"
			" jb 2b \n\t"
			"addl %6, %1 \n\t"
			"psrad $8, %%mm4 \n\t"
			"psrad $8, %%mm5 \n\t"
			"packssdw %%mm5, %%mm4 \n\t"
			"pmaddwd %%mm6, %%mm4 \n\t"
			"packssdw %%mm4, %%mm4 \n\t"
			"movl %3, %%eax \n\t"
			"movd %%mm4, (%%eax, %0) \n\t"
			"addl $4, %0 \n\t"
			" jnc 1b \n\t"
			: "+r" (counter), "+r" (filter)
			: "m" (filterPos), "m" (dst), "m"(src+filterSize),
			  "m" (src), "r" (filterSize*2)
			: "%ebx", "%eax", "%ecx"
		);
	}
#else
	int i;
	for(i=0; i<dstW; i++)
	{
		int j;
		int srcPos= filterPos[i];
		int val=0;
//		printf("filterPos: %d\n", filterPos[i]);
		for(j=0; j<filterSize; j++)
		{
//			printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
			val += ((int)src[srcPos + j])*filter[filterSize*i + j];
		}
//		filter += hFilterSize;
		dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
//		dst[i] = val>>7;
	}
#endif
}

// *** horizontal scale Y line to temp buffer
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
				   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
				   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
				   int32_t *mmx2FilterPos)
{
	if(srcFormat==IMGFMT_YUY2)
	{
		RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
		src= formatConvBuffer;
	}
	else if(srcFormat==IMGFMT_BGR32)
	{
		RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
		src= formatConvBuffer;
	}
	else if(srcFormat==IMGFMT_BGR24)
	{
		RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
		src= formatConvBuffer;
	}
	else if(srcFormat==IMGFMT_BGR16)
	{
		RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
		src= formatConvBuffer;
	}
	else if(srcFormat==IMGFMT_BGR15)
	{
		RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
		src= formatConvBuffer;
	}
	else if(srcFormat==IMGFMT_RGB32)
	{
		RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
		src= formatConvBuffer;
	}
	else if(srcFormat==IMGFMT_RGB24)
	{
		RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
		src= formatConvBuffer;
	}
#ifdef HAVE_MMX
	// use the new MMX scaler if the MMX2 one can't be used (it's faster than the plain x86 asm one)
	if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
	if(!(flags&SWS_FAST_BILINEAR))
#endif
	{
		RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
	}
	else // Fast Bilinear upscale / crap downscale
	{
#ifdef ARCH_X86
#ifdef HAVE_MMX2
	int i;
	if(canMMX2BeUsed)
	{
		asm volatile(
			"pxor %%mm7, %%mm7 \n\t"
			"movl %0, %%ecx \n\t"
			"movl %1, %%edi \n\t"
			"movl %2, %%edx \n\t"
			"movl %3, %%ebx \n\t"
			"xorl %%eax, %%eax \n\t" // i
			PREFETCH" (%%ecx) \n\t"
			PREFETCH" 32(%%ecx) \n\t"
			PREFETCH" 64(%%ecx) \n\t"
#define FUNNY_Y_CODE \
			"movl (%%ebx), %%esi \n\t"\
			"call *%4 \n\t"\
			"addl (%%ebx, %%eax), %%ecx \n\t"\
			"addl %%eax, %%edi \n\t"\
			"xorl %%eax, %%eax \n\t"\

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

		:: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
		   "m" (funnyYCode)
		: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
		);
		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
	}
	else
	{
#endif
	//no MMX2: plain x86 asm ...
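	/* 16.16 fixed-point DDA: %ecx accumulates the fractional position
	   (2*xalpha), %ebx the integer source index; "addw" adds the fractional
	   part of xInc and "adcl" then adds the integer part plus the carry,
	   stepping the source position by exactly xInc per output pixel */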
	asm volatile(
		"xorl %%eax, %%eax \n\t" // i
		"xorl %%ebx, %%ebx \n\t" // xx
		"xorl %%ecx, %%ecx \n\t" // 2*xalpha
		".balign 16 \n\t"
		"1: \n\t"
		"movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
		"movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
		"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi \n\t"
		"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi \n\t"
		"shrl $9, %%esi \n\t"
		"movw %%si, (%%edi, %%eax, 2) \n\t"
		"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
		"adcl %3, %%ebx \n\t" //xx+= (xInc>>16) + carry
		"movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
		"movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
		"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi \n\t"
		"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi \n\t"
		"shrl $9, %%esi \n\t"
		"movw %%si, 2(%%edi, %%eax, 2) \n\t"
		"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
		"adcl %3, %%ebx \n\t" //xx+= (xInc>>16) + carry
		"addl $2, %%eax \n\t"
		"cmpl %2, %%eax \n\t"
		" jb 1b \n\t"
		:: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
		: "%eax", "%ebx", "%ecx", "%edi", "%esi"
		);
#ifdef HAVE_MMX2
	} //if MMX2 can't be used
#endif
#else
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
		xpos+=xInc;
	}
#endif
	}
}

inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
				   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
				   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
				   int32_t *mmx2FilterPos)
{
	if(srcFormat==IMGFMT_YUY2)
	{
		RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
		src1= formatConvBuffer;
		src2= formatConvBuffer+2048;
	}
	else if(srcFormat==IMGFMT_BGR32)
	{
		RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
		src1= formatConvBuffer;
		src2= formatConvBuffer+2048;
	}
	else if(srcFormat==IMGFMT_BGR24)
	{
		RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
		src1= formatConvBuffer;
		src2= formatConvBuffer+2048;
	}
	else if(srcFormat==IMGFMT_BGR16)
	{
		RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
		src1= formatConvBuffer;
		src2= formatConvBuffer+2048;
	}
	else if(srcFormat==IMGFMT_BGR15)
	{
		RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
		src1= formatConvBuffer;
		src2= formatConvBuffer+2048;
	}
	else if(srcFormat==IMGFMT_RGB32)
	{
		RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
		src1= formatConvBuffer;
		src2= formatConvBuffer+2048;
	}
	else if(srcFormat==IMGFMT_RGB24)
	{
		RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
		src1= formatConvBuffer;
		src2= formatConvBuffer+2048;
	}
	else if(isGray(srcFormat))
	{
		return;
	}
#ifdef HAVE_MMX
	// use the new MMX scaler if the MMX2 one can't be used (it's faster than the plain x86 asm one)
	if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
	if(!(flags&SWS_FAST_BILINEAR))
#endif
	{
		RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
		RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
	}
	else // Fast Bilinear upscale / crap downscale
	{
#ifdef ARCH_X86
#ifdef HAVE_MMX2
	int i;
	if(canMMX2BeUsed)
	{
		asm volatile(
			"pxor %%mm7, %%mm7 \n\t"
			"movl %0, %%ecx \n\t"
			"movl %1, %%edi \n\t"
			"movl %2, %%edx \n\t"
			"movl %3, %%ebx \n\t"
			"xorl %%eax, %%eax \n\t" // i
			PREFETCH" (%%ecx) \n\t"
			PREFETCH" 32(%%ecx) \n\t"
			PREFETCH" 64(%%ecx) \n\t"
#define FUNNY_UV_CODE \
			"movl (%%ebx), %%esi \n\t"\
			"call *%4 \n\t"\
			"addl (%%ebx, %%eax), %%ecx \n\t"\
			"addl %%eax, %%edi \n\t"\
			"xorl %%eax, %%eax \n\t"\

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
			"xorl %%eax, %%eax \n\t" // i
			"movl %5, %%ecx \n\t" // src
			"movl %1, %%edi \n\t" // buf1
			"addl $4096, %%edi \n\t"
			PREFETCH" (%%ecx) \n\t"
			PREFETCH" 32(%%ecx) \n\t"
			PREFETCH" 64(%%ecx) \n\t"
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

		:: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
		   "m" (funnyUVCode), "m" (src2)
		: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
		);
		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
		{
//			printf("%d %d %d\n", dstWidth, i, srcW);
			dst[i] = src1[srcW-1]*128;
			dst[i+2048] = src2[srcW-1]*128;
		}
	}
	else
	{
#endif
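	/* same 16.16 fixed-point DDA as in hyscale, but interleaved over the
	   two chroma source lines; the V results land 4096 bytes (2048 int16_t
	   entries) after the U results */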
	asm volatile(
		"xorl %%eax, %%eax \n\t" // i
		"xorl %%ebx, %%ebx \n\t" // xx
		"xorl %%ecx, %%ecx \n\t" // 2*xalpha
		".balign 16 \n\t"
		"1: \n\t"
		"movl %0, %%esi \n\t"
		"movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
		"movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
		"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi \n\t"
		"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi \n\t"
		"shrl $9, %%esi \n\t"
		"movw %%si, (%%edi, %%eax, 2) \n\t"
		"movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
		"movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
		"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi \n\t"
		"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
		"movl %1, %%edi \n\t"
		"shrl $9, %%esi \n\t"
		"movw %%si, 4096(%%edi, %%eax, 2)\n\t"
		"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
		"adcl %3, %%ebx \n\t" //xx+= (xInc>>16) + carry
		"addl $1, %%eax \n\t"
		"cmpl %2, %%eax \n\t"
		" jb 1b \n\t"
		:: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
		   "r" (src2)
		: "%eax", "%ebx", "%ecx", "%edi", "%esi"
		);
#ifdef HAVE_MMX2
	} //if MMX2 can't be used
#endif
#else
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
		dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
		dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
		dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
		xpos+=xInc;
	}
#endif
	}
}
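
/* main entry: for each output line, horizontally scale the source lines it
 * needs into the luma/chroma ring buffers (lumPixBuf/chrPixBuf), then run
 * the vertical filter over those buffered lines and emit the result in the
 * requested dstFormat. Slices may arrive incrementally, so the buffer state
 * is kept in the context between calls. */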
static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
			    int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
	/* load a few things into local vars to make the code more readable and faster */
	const int srcW= c->srcW;
	const int dstW= c->dstW;
	const int dstH= c->dstH;
	const int chrDstW= c->chrDstW;
	const int chrSrcW= c->chrSrcW;
	const int lumXInc= c->lumXInc;
	const int chrXInc= c->chrXInc;
	const int dstFormat= c->dstFormat;
	const int srcFormat= c->srcFormat;
	const int flags= c->flags;
	const int canMMX2BeUsed= c->canMMX2BeUsed;
	int16_t *vLumFilterPos= c->vLumFilterPos;
	int16_t *vChrFilterPos= c->vChrFilterPos;
	int16_t *hLumFilterPos= c->hLumFilterPos;
	int16_t *hChrFilterPos= c->hChrFilterPos;
	int16_t *vLumFilter= c->vLumFilter;
	int16_t *vChrFilter= c->vChrFilter;
	int16_t *hLumFilter= c->hLumFilter;
	int16_t *hChrFilter= c->hChrFilter;
	int16_t *lumMmxFilter= c->lumMmxFilter;
	int16_t *chrMmxFilter= c->chrMmxFilter;
	const int vLumFilterSize= c->vLumFilterSize;
	const int vChrFilterSize= c->vChrFilterSize;
	const int hLumFilterSize= c->hLumFilterSize;
	const int hChrFilterSize= c->hChrFilterSize;
	int16_t **lumPixBuf= c->lumPixBuf;
	int16_t **chrPixBuf= c->chrPixBuf;
	const int vLumBufSize= c->vLumBufSize;
	const int vChrBufSize= c->vChrBufSize;
	uint8_t *funnyYCode= c->funnyYCode;
	uint8_t *funnyUVCode= c->funnyUVCode;
	uint8_t *formatConvBuffer= c->formatConvBuffer;
	const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
	const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
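	/* -((-x) >> n) rounds towards +infinity, so a slice that covers a
	   partial chroma line still counts that line */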
	/* vars which will change and which we need to store back in the context */
	int dstY= c->dstY;
	int lumBufIndex= c->lumBufIndex;
	int chrBufIndex= c->chrBufIndex;
	int lastInLumBuf= c->lastInLumBuf;
	int lastInChrBuf= c->lastInChrBuf;
	int srcStride[3];
	int dstStride[3];
	uint8_t *src[3];
	uint8_t *dst[3];

	orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
	orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
	if(isPacked(c->srcFormat)){
		src[0]=
		src[1]=
		src[2]= srcParam[0];
		srcStride[0]=
		srcStride[1]=
		srcStride[2]= srcStrideParam[0];
	}
	srcStride[1]<<= c->vChrDrop;
	srcStride[2]<<= c->vChrDrop;
//	printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
//		(int)dst[0], (int)dst[1], (int)dst[2]);

#if 0 //self test FIXME move to a vfilter or something
	{
	static volatile int i=0;
	i++;
	if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
		selfTest(src, srcStride, c->srcW, c->srcH);
	i--;
	}
#endif

//	printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
//		dstStride[0],dstStride[1],dstStride[2]);

	if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
	{
		static int firstTime=1; //FIXME move this into the context perhaps
		if(flags & SWS_PRINT_INFO && firstTime)
		{
			mp_msg(MSGT_SWS,MSGL_WARN,"SwScaler: Warning: dstStride is not aligned!\n"
			       "SwScaler: ->cannot do aligned memory accesses anymore\n");
			firstTime=0;
		}
	}
	/* Note: the user might start scaling in the middle of the picture, so this
	   will not get executed. That is not really intended, but it currently
	   works, so people might do it. */
	if(srcSliceY ==0){
		lumBufIndex=0;
		chrBufIndex=0;
		dstY=0;
		lastInLumBuf= -1;
		lastInChrBuf= -1;
	}
	for(;dstY < dstH; dstY++){
		unsigned char *dest =dst[0]+dstStride[0]*dstY;
		const int chrDstY= dstY>>c->chrDstVSubSample;
		unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
		unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

		const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
		const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
		const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
		const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

		//handle holes (FAST_BILINEAR & weird filters)
		if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
		if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
//		printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
		ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
		ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)

		// Do we have enough lines in this slice to output the dstY line
		if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
		{
			//Do horizontal scaling
			while(lastInLumBuf < lastLumSrcY)
			{
				uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
				lumBufIndex++;
//				printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
				ASSERT(lumBufIndex < 2*vLumBufSize)
				ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
				ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
//				printf("%d %d\n", lumBufIndex, vLumBufSize);
				RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
						flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
						funnyYCode, c->srcFormat, formatConvBuffer,
						c->lumMmx2Filter, c->lumMmx2FilterPos);
				lastInLumBuf++;
			}
			while(lastInChrBuf < lastChrSrcY)
			{
				uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
				uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
				chrBufIndex++;
				ASSERT(chrBufIndex < 2*vChrBufSize)
				ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
				ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
				//FIXME replace parameters through context struct (some at least)
				if(!(isGray(srcFormat) || isGray(dstFormat)))
					RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
							flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
							funnyUVCode, c->srcFormat, formatConvBuffer,
							c->chrMmx2Filter, c->chrMmx2FilterPos);
				lastInChrBuf++;
			}
			//wrap buf index around to stay inside the ring buffer
			if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
			if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
		}
		else // not enough lines left in this slice -> load the rest in the buffer
		{
/*			printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
				firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
				lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
				vChrBufSize, vLumBufSize);*/

			//Do horizontal scaling
			while(lastInLumBuf+1 < srcSliceY + srcSliceH)
			{
				uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
				lumBufIndex++;
				ASSERT(lumBufIndex < 2*vLumBufSize)
				ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
				ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
				RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
						flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
						funnyYCode, c->srcFormat, formatConvBuffer,
						c->lumMmx2Filter, c->lumMmx2FilterPos);
				lastInLumBuf++;
			}
			while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
			{
				uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
				uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
				chrBufIndex++;
				ASSERT(chrBufIndex < 2*vChrBufSize)
				ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
				ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
				if(!(isGray(srcFormat) || isGray(dstFormat)))
					RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
							flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
							funnyUVCode, c->srcFormat, formatConvBuffer,
							c->chrMmx2Filter, c->chrMmx2FilterPos);
				lastInChrBuf++;
			}
			//wrap buf index around to stay inside the ring buffer
			if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
			if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
			break; //we can't output a dstY line, so let's try with the next slice
		}

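		/* per-line alternation of the dither tables: spreading the 5/6-bit
		   RGB rounding error between even and odd lines avoids banding */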
#ifdef HAVE_MMX
		b5Dither= dither8[dstY&1];
		g6Dither= dither4[dstY&1];
		g5Dither= dither8[dstY&1];
		r5Dither= dither8[(dstY+1)&1];
#endif
		if(dstY < dstH-2)
		{
			if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
			{
				if((dstY&1) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
				if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
				{
					int16_t *lumBuf = lumPixBuf[0];
					int16_t *chrBuf= chrPixBuf[0];
					RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
				}
				else //General YV12
				{
					int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
					int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
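					/* lumBufIndex is the ring-buffer slot of line
					   lastInLumBuf; stepping back by (lastInLumBuf -
					   firstLumSrcY) gives the first line of the filter
					   window. The pointer arrays appear to be set up with
					   2*vLumBufSize entries, the second half mirroring the
					   first (see the ASSERTs below), so adding vLumBufSize
					   keeps the index in range without a modulo. */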
					RENAME(yuv2yuvX)(
						vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
						vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
						dest, uDest, vDest, dstW, chrDstW,
						lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+chrDstY*vChrFilterSize*4);
				}
			}
			else
			{
				int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
				int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;

				ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
				ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
				if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
				{
					int chrAlpha= vChrFilter[2*dstY+1];
					RENAME(yuv2rgb1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
						dest, dstW, chrAlpha, dstFormat, flags, dstY);
				}
				else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
				{
					int lumAlpha= vLumFilter[2*dstY+1];
					int chrAlpha= vChrFilter[2*dstY+1];
					RENAME(yuv2rgb2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
						dest, dstW, lumAlpha, chrAlpha, dstY);
				}
				else //General RGB
				{
					RENAME(yuv2rgbX)(c,
						vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
						vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
						dest, dstW,
						lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4, dstY);
				}
			}
		}
		else // hmm, looks like we can't use MMX here without overwriting this array's tail
		{
			int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
			int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
			if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
			{
				if((dstY&1) || isGray(dstFormat)) uDest=vDest= NULL;
				yuv2yuvXinC(
					vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
					vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
					dest, uDest, vDest, dstW, chrDstW);
			}
			else
			{
				ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
				ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
				yuv2rgbXinC(c,
					vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
					vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
					dest, dstW, dstY);
			}
		}
	}

#ifdef HAVE_MMX
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* store changed local vars back in the context */
	c->dstY= dstY;
	c->lumBufIndex= lumBufIndex;
	c->chrBufIndex= chrBufIndex;
	c->lastInLumBuf= lastInLumBuf;
	c->lastInChrBuf= lastInChrBuf;
}