/*
Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
the C code (not assembly, mmx, ...) of the swscaler which has been written
by Michael Niedermayer can be used under the LGPL license too
*/
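/* Pick the CPU-specific instruction variants (emms/femms, prefetch, sfence,
   pavgb, movntq) according to which extensions (MMX2, 3DNow!) are available. */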
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped onto emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
#define YSCALEYUV2YV12X(x, offset, dest, width) \
asm volatile(\
"xor %%"REG_a", %%"REG_a" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
ASMALIGN(4) /* FIXME Unroll? */\
"1: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
" jnz 1b \n\t"\
"psraw $3, %%mm3 \n\t"\
"psraw $3, %%mm4 \n\t"\
"packuswb %%mm4, %%mm3 \n\t"\
MOVNTQ(%%mm3, (%1, %%REGa))\
"add $8, %%"REG_a" \n\t"\
"cmp %2, %%"REG_a" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"\
:: "r" (&c->redDither),\
"r" (dest), "p" (width)\
: "%"REG_a, "%"REG_d, "%"REG_S\
);
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
asm volatile(\
"lea " offset "(%0), %%"REG_d" \n\t"\
"xor %%"REG_a", %%"REG_a" \n\t"\
"pxor %%mm4, %%mm4 \n\t"\
"pxor %%mm5, %%mm5 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
ASMALIGN(4) \
"1: \n\t"\
"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0\n\t" /* srcData */\
"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
"mov 4(%%"REG_d"), %%"REG_S" \n\t"\
"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1\n\t" /* srcData */\
"movq %%mm0, %%mm3 \n\t"\
"punpcklwd %%mm1, %%mm0 \n\t"\
"punpckhwd %%mm1, %%mm3 \n\t"\
"movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
"pmaddwd %%mm1, %%mm0 \n\t"\
"pmaddwd %%mm1, %%mm3 \n\t"\
"paddd %%mm0, %%mm4 \n\t"\
"paddd %%mm3, %%mm5 \n\t"\
"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3\n\t" /* srcData */\
"mov 16(%%"REG_d"), %%"REG_S" \n\t"\
"add $16, %%"REG_d" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"punpcklwd %%mm3, %%mm2 \n\t"\
"punpckhwd %%mm3, %%mm0 \n\t"\
"pmaddwd %%mm1, %%mm2 \n\t"\
"pmaddwd %%mm1, %%mm0 \n\t"\
"paddd %%mm2, %%mm6 \n\t"\
"paddd %%mm0, %%mm7 \n\t"\
" jnz 1b \n\t"\
"psrad $16, %%mm4 \n\t"\
"psrad $16, %%mm5 \n\t"\
"psrad $16, %%mm6 \n\t"\
"psrad $16, %%mm7 \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
"packssdw %%mm5, %%mm4 \n\t"\
"packssdw %%mm7, %%mm6 \n\t"\
"paddw %%mm0, %%mm4 \n\t"\
"paddw %%mm0, %%mm6 \n\t"\
"psraw $3, %%mm4 \n\t"\
"psraw $3, %%mm6 \n\t"\
"packuswb %%mm6, %%mm4 \n\t"\
MOVNTQ(%%mm4, (%1, %%REGa))\
"add $8, %%"REG_a" \n\t"\
"cmp %2, %%"REG_a" \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"pxor %%mm4, %%mm4 \n\t"\
"pxor %%mm5, %%mm5 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"\
:: "r" (&c->redDither),\
"r" (dest), "p" (width)\
: "%"REG_a, "%"REG_d, "%"REG_S\
);
#define YSCALEYUV2YV121 \
"mov %2, %%"REG_a" \n\t"\
ASMALIGN(4) /* FIXME Unroll? */\
"1: \n\t"\
"movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
"movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
"psraw $7, %%mm0 \n\t"\
"psraw $7, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
MOVNTQ(%%mm0, (%1, %%REGa))\
"add $8, %%"REG_a" \n\t"\
"jnc 1b \n\t"
/*
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
"r" (dest), "m" (dstW),
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2PACKEDX \
asm volatile(\
"xor %%"REG_a", %%"REG_a" \n\t"\
ASMALIGN(4)\
"nop \n\t"\
"1: \n\t"\
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
ASMALIGN(4)\
"2: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
"movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
" jnz 2b \n\t"\
\
"lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
"movq %%mm1, %%mm7 \n\t"\
ASMALIGN(4)\
"2: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm1 \n\t"\
"paddw %%mm5, %%mm7 \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
" jnz 2b \n\t"\
#define YSCALEYUV2PACKEDX_END\
:: "r" (&c->redDither), \
"m" (dummy), "m" (dummy), "m" (dummy),\
"r" (dest), "m" (dstW)\
: "%"REG_a, "%"REG_d, "%"REG_S\
);
#define YSCALEYUV2PACKEDX_ACCURATE \
asm volatile(\
"xor %%"REG_a", %%"REG_a" \n\t"\
ASMALIGN(4)\
"nop \n\t"\
"1: \n\t"\
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"pxor %%mm4, %%mm4 \n\t"\
"pxor %%mm5, %%mm5 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
ASMALIGN(4)\
"2: \n\t"\
"movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
"movq 4096(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
"mov 4(%%"REG_d"), %%"REG_S" \n\t"\
"movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
"movq %%mm0, %%mm3 \n\t"\
"punpcklwd %%mm1, %%mm0 \n\t"\
"punpckhwd %%mm1, %%mm3 \n\t"\
"movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
"pmaddwd %%mm1, %%mm0 \n\t"\
"pmaddwd %%mm1, %%mm3 \n\t"\
"paddd %%mm0, %%mm4 \n\t"\
"paddd %%mm3, %%mm5 \n\t"\
"movq 4096(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
"mov 16(%%"REG_d"), %%"REG_S" \n\t"\
"add $16, %%"REG_d" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"punpcklwd %%mm3, %%mm2 \n\t"\
"punpckhwd %%mm3, %%mm0 \n\t"\
"pmaddwd %%mm1, %%mm2 \n\t"\
"pmaddwd %%mm1, %%mm0 \n\t"\
"paddd %%mm2, %%mm6 \n\t"\
"paddd %%mm0, %%mm7 \n\t"\
" jnz 2b \n\t"\
"psrad $16, %%mm4 \n\t"\
"psrad $16, %%mm5 \n\t"\
"psrad $16, %%mm6 \n\t"\
"psrad $16, %%mm7 \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
"packssdw %%mm5, %%mm4 \n\t"\
"packssdw %%mm7, %%mm6 \n\t"\
"paddw %%mm0, %%mm4 \n\t"\
"paddw %%mm0, %%mm6 \n\t"\
"movq %%mm4, "U_TEMP"(%0) \n\t"\
"movq %%mm6, "V_TEMP"(%0) \n\t"\
\
"lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"pxor %%mm1, %%mm1 \n\t"\
"pxor %%mm5, %%mm5 \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
ASMALIGN(4)\
"2: \n\t"\
"movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
"mov 4(%%"REG_d"), %%"REG_S" \n\t"\
"movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
"movq %%mm0, %%mm3 \n\t"\
"punpcklwd %%mm4, %%mm0 \n\t"\
"punpckhwd %%mm4, %%mm3 \n\t"\
"movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
"pmaddwd %%mm4, %%mm0 \n\t"\
"pmaddwd %%mm4, %%mm3 \n\t"\
"paddd %%mm0, %%mm1 \n\t"\
"paddd %%mm3, %%mm5 \n\t"\
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
"mov 16(%%"REG_d"), %%"REG_S" \n\t"\
"add $16, %%"REG_d" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"punpcklwd %%mm3, %%mm2 \n\t"\
"punpckhwd %%mm3, %%mm0 \n\t"\
"pmaddwd %%mm4, %%mm2 \n\t"\
"pmaddwd %%mm4, %%mm0 \n\t"\
"paddd %%mm2, %%mm7 \n\t"\
"paddd %%mm0, %%mm6 \n\t"\
" jnz 2b \n\t"\
"psrad $16, %%mm1 \n\t"\
"psrad $16, %%mm5 \n\t"\
"psrad $16, %%mm7 \n\t"\
"psrad $16, %%mm6 \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
"packssdw %%mm5, %%mm1 \n\t"\
"packssdw %%mm6, %%mm7 \n\t"\
"paddw %%mm0, %%mm1 \n\t"\
"paddw %%mm0, %%mm7 \n\t"\
"movq "U_TEMP"(%0), %%mm3 \n\t"\
"movq "V_TEMP"(%0), %%mm4 \n\t"\
#define YSCALEYUV2RGBX \
"psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
"pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
"pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
"psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
"pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#if 0
#define FULL_YSCALEYUV2RGB \
"pxor %%mm7, %%mm7 \n\t"\
"movd %6, %%mm6 \n\t" /*yalpha1*/\
"punpcklwd %%mm6, %%mm6 \n\t"\
"punpcklwd %%mm6, %%mm6 \n\t"\
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"xor %%"REG_a", %%"REG_a" \n\t"\
ASMALIGN(4)\
"1: \n\t"\
"movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
\
\
"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
"psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
\
\
"movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
"paddw %%mm1, %%mm3 \n\t" /* B*/\
"paddw %%mm1, %%mm0 \n\t" /* R*/\
"packuswb %%mm3, %%mm3 \n\t"\
\
"packuswb %%mm0, %%mm0 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm2, %%mm1 \n\t" /* G*/\
\
"packuswb %%mm1, %%mm1 \n\t"
#endif
#define REAL_YSCALEYUV2PACKED(index, c) \
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
"psraw $3, %%mm0 \n\t"\
"psraw $3, %%mm1 \n\t"\
"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
"xor "#index", "#index" \n\t"\
ASMALIGN(4)\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
"movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
#define REAL_YSCALEYUV2RGB(index, c) \
"xor "#index", "#index" \n\t"\
ASMALIGN(4)\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
"movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
#define REAL_YSCALEYUV2PACKED1(index, c) \
"xor "#index", "#index" \n\t"\
ASMALIGN(4)\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $7, %%mm3 \n\t" \
"psraw $7, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t" \
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
#define REAL_YSCALEYUV2RGB1(index, c) \
"xor "#index", "#index" \n\t"\
ASMALIGN(4)\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
#define REAL_YSCALEYUV2PACKED1b(index, c) \
"xor "#index", "#index" \n\t"\
ASMALIGN(4)\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $8, %%mm3 \n\t" \
"psrlw $8, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
"xor "#index", "#index" \n\t"\
ASMALIGN(4)\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
"psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
#define REAL_WRITEBGR32(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
MOVNTQ(%%mm0, (dst, index, 4))\
MOVNTQ(%%mm2, 8(dst, index, 4))\
MOVNTQ(%%mm1, 16(dst, index, 4))\
MOVNTQ(%%mm3, 24(dst, index, 4))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
#define REAL_WRITEBGR16(dst, dstw, index) \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
"pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
"psrlq $3, %%mm2 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $3, %%mm3 \n\t"\
"psllq $3, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (dst, index, 2))\
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
#define REAL_WRITEBGR15(dst, dstw, index) \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
"pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
"psrlq $3, %%mm2 \n\t"\
"psrlq $1, %%mm5 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $2, %%mm3 \n\t"\
"psllq $2, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (dst, index, 2))\
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
#define WRITEBGR24OLD(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
"psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
"pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
"por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
"movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
"psllq $48, %%mm2 \n\t" /* GB000000 1 */\
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
"movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
"psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
"por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
"pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
"movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
"psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
"pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
"por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
"movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
"psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
"por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
"psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
"movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
"psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
"pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
"pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
"por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
"psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
"por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
MOVNTQ(%%mm0, (dst))\
MOVNTQ(%%mm2, 8(dst))\
MOVNTQ(%%mm3, 16(dst))\
"add $24, "#dst" \n\t"\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR24MMX(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
"movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
"movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
"movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
"psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
"psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
"psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
"psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
"punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
"punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
"punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
"punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
"psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
"movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
"psllq $40, %%mm2 \n\t" /* GB000000 1 */\
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
MOVNTQ(%%mm0, (dst))\
\
"psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
"movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
"psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
"por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
MOVNTQ(%%mm6, 8(dst))\
\
"psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
"psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
"por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
MOVNTQ(%%mm5, 16(dst))\
\
"add $24, "#dst" \n\t"\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR24MMX2(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq "MANGLE(M24A)", %%mm0 \n\t"\
"movq "MANGLE(M24C)", %%mm7 \n\t"\
"pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
"pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
"pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
"pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
"pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
"pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
"psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
"por %%mm1, %%mm6 \n\t"\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, (dst))\
\
"psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
"pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
"pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
"pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
"pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
"pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
"pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
"por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, 8(dst))\
\
"pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
"pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
"pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
"pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
"pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
"pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
"por %%mm1, %%mm3 \n\t"\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, 16(dst))\
\
"add $24, "#dst" \n\t"\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
#define REAL_WRITEYUY2(dst, dstw, index) \
"packuswb %%mm3, %%mm3 \n\t"\
"packuswb %%mm4, %%mm4 \n\t"\
"packuswb %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm4, %%mm3 \n\t"\
"movq %%mm1, %%mm7 \n\t"\
"punpcklbw %%mm3, %%mm1 \n\t"\
"punpckhbw %%mm3, %%mm7 \n\t"\
\
MOVNTQ(%%mm1, (dst, index, 2))\
MOVNTQ(%%mm7, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
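/**
 * vertical scale: apply the luma and chroma vertical filters to the 16-bit
 * intermediate lines and write planar YV12 output
 */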
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
if(c->flags & SWS_ACCURATE_RND){
if(uDest){
YSCALEYUV2YV12X_ACCURATE( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
}
YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
}else{
if(uDest){
YSCALEYUV2YV12X( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
}
YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
}
#else
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize,
dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize,
dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}
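/**
 * same vertical scaling, but for NV12/NV21 (interleaved chroma) output;
 * always uses the C implementation
 */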
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize,
dest, uDest, dstW, chrDstW, dstFormat);
}
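/**
 * unscaled case: round and clip the 16-bit intermediate lines down to
 * 8-bit planar YV12 without any vertical filtering
 */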
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
if(uDest != NULL)
{
asm volatile(
YSCALEYUV2YV121
:: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
"g" (-chrDstW)
: "%"REG_a
);
asm volatile(
YSCALEYUV2YV121
:: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
"g" (-chrDstW)
: "%"REG_a
);
}
asm volatile(
YSCALEYUV2YV121
:: "r" (lumSrc + dstW), "r" (dest + dstW),
"g" (-dstW)
: "%"REG_a
);
#else
int i;
for(i=0; i<dstW; i++)
{
int val= lumSrc[i]>>7;
if(val&256){
if(val<0) val=0;
else val=255;
}
dest[i]= val;
}
if(uDest != NULL)
for(i=0; i<chrDstW; i++)
{
int u=chrSrc[i]>>7;
int v=chrSrc[i + 2048]>>7;
if((u|v)&256){
if(u<0) u=0;
else if (u>255) u=255;
if(v<0) v=0;
else if (v>255) v=255;
}
uDest[i]= u;
vDest[i]= v;
}
#endif
}
/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, long dstW, long dstY)
{
long dummy=0;
#ifdef HAVE_MMX
if(c->flags & SWS_ACCURATE_RND){
switch(c->dstFormat){
case PIX_FMT_RGB32:
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
WRITEBGR32(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
return;
case PIX_FMT_BGR24:
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
"add %4, %%"REG_c" \n\t"
WRITEBGR24(%%REGc, %5, %%REGa)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW)
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
);
return;
case PIX_FMT_BGR555:
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR15(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
return;
case PIX_FMT_BGR565:
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR16(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
return;
case PIX_FMT_YUYV422:
YSCALEYUV2PACKEDX_ACCURATE
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
"psraw $3, %%mm3 \n\t"
"psraw $3, %%mm4 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
WRITEYUY2(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
return;
}
}else{
switch(c->dstFormat)
{
case PIX_FMT_RGB32:
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
WRITEBGR32(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
return;
case PIX_FMT_BGR24:
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
"add %4, %%"REG_c" \n\t"
WRITEBGR24(%%REGc, %5, %%REGa)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW)
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
);
return;
case PIX_FMT_BGR555:
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR15(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
return;
case PIX_FMT_BGR565:
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR16(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
return;
case PIX_FMT_YUYV422:
YSCALEYUV2PACKEDX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
"psraw $3, %%mm3 \n\t"
"psraw $3, %%mm4 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
WRITEYUY2(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
return;
}
}
#endif
#ifdef HAVE_ALTIVEC
/* The following list of supported dstFormat values should
match what's found in the body of altivec_yuv2packedX() */
if(c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize,
dest, dstW, dstY);
else
#endif
yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize,
dest, dstW, dstY);
}
/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
int yalpha1=yalpha^4095;
int uvalpha1=uvalpha^4095;
int i;
#if 0 //isn't used
if(flags&SWS_FULL_CHR_H_INT)
{
switch(dstFormat)
{
#ifdef HAVE_MMX
case PIX_FMT_RGB32:
asm volatile(
FULL_YSCALEYUV2RGB
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
"movq %%mm3, %%mm1 \n\t"
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
MOVNTQ(%%mm3, (%4, %%REGa, 4))
MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
"add $4, %%"REG_a" \n\t"
"cmp %5, %%"REG_a" \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%"REG_a
);
break;
case PIX_FMT_BGR24:
asm volatile(
FULL_YSCALEYUV2RGB
// lsb ... msb
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
"movq %%mm3, %%mm1 \n\t"
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
"movq %%mm3, %%mm2 \n\t" // BGR0BGR0
"psrlq $8, %%mm3 \n\t" // GR0BGR00
"pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
"pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
"por %%mm2, %%mm3 \n\t" // BGRBGR00
"movq %%mm1, %%mm2 \n\t"
"psllq $48, %%mm1 \n\t" // 000000BG
"por %%mm1, %%mm3 \n\t" // BGRBGRBG
"movq %%mm2, %%mm1 \n\t" // BGR0BGR0
"psrld $16, %%mm2 \n\t" // R000R000
"psrlq $24, %%mm1 \n\t" // 0BGR0000
"por %%mm2, %%mm1 \n\t" // RBGRR000
"mov %4, %%"REG_b" \n\t"
"add %%"REG_a", %%"REG_b" \n\t"
#ifdef HAVE_MMX2
//FIXME Alignment
"movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
"movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
#else
"movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
"psrlq $32, %%mm3 \n\t"
"movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
"movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
#endif
"add $4, %%"REG_a" \n\t"
"cmp %5, %%"REG_a" \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%"REG_a, "%"REG_b
);
break;
case PIX_FMT_BGR555:
asm volatile(
FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
"paddusb "MANGLE(g5Dither)", %%mm1\n\t"
"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
"psrlw $3, %%mm3 \n\t"
"psllw $2, %%mm1 \n\t"
"psllw $7, %%mm0 \n\t"
"pand "MANGLE(g15Mask)", %%mm1 \n\t"
"pand "MANGLE(r15Mask)", %%mm0 \n\t"
"por %%mm3, %%mm1 \n\t"
"por %%mm1, %%mm0 \n\t"
MOVNTQ(%%mm0, (%4, %%REGa, 2))
"add $4, %%"REG_a" \n\t"
"cmp %5, %%"REG_a" \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%"REG_a
);
break;
case PIX_FMT_BGR565:
asm volatile(
FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
"paddusb "MANGLE(g6Dither)", %%mm1\n\t"
"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
"psrlw $3, %%mm3 \n\t"
"psllw $3, %%mm1 \n\t"
"psllw $8, %%mm0 \n\t"
"pand "MANGLE(g16Mask)", %%mm1 \n\t"
"pand "MANGLE(r16Mask)", %%mm0 \n\t"
"por %%mm3, %%mm1 \n\t"
"por %%mm1, %%mm0 \n\t"
MOVNTQ(%%mm0, (%4, %%REGa, 2))
"add $4, %%"REG_a" \n\t"
"cmp %5, %%"REG_a" \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%"REG_a
);
break;
#endif
case PIX_FMT_BGR32:
#ifndef HAVE_MMX
case PIX_FMT_RGB32:
#endif
if(dstFormat==PIX_FMT_RGB32)
{
int i;
#ifdef WORDS_BIGENDIAN
dest++;
#endif
for(i=0;i<dstW;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
dest+= 4;
}
}
else if(dstFormat==PIX_FMT_BGR24)
{
int i;
for(i=0;i<dstW;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
dest+= 3;
}
}
else if(dstFormat==PIX_FMT_BGR565)
{
int i;
for(i=0;i<dstW;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
((uint16_t*)dest)[i] =
clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
clip_table16r[(Y + yuvtab_3343[V]) >>13];
}
}
else if(dstFormat==PIX_FMT_BGR555)
{
int i;
for(i=0;i<dstW;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
((uint16_t*)dest)[i] =
clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
clip_table15r[(Y + yuvtab_3343[V]) >>13];
}
}
}//FULL_UV_IPOL
else
{
#endif // if 0
#ifdef HAVE_MMX
switch(c->dstFormat)
{
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
case PIX_FMT_RGB32:
asm volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5)
WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
return;
case PIX_FMT_BGR24:
asm volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5)
WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
return;
case PIX_FMT_BGR555:
asm volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5)
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
return;
case PIX_FMT_BGR565:
asm volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5)
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
return;
case PIX_FMT_YUYV422:
asm volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2PACKED(%%REGBP, %5)
WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
return;
default: break;
}
#endif //HAVE_MMX
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
}
/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
const int yalpha1=0;
int i;
uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
const int yalpha= 4096; //FIXME ...
if(flags&SWS_FULL_CHR_H_INT)
{
RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
return;
}
#ifdef HAVE_MMX
if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
{
switch(dstFormat)
{
case PIX_FMT_RGB32:
asm volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5)
WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
return;
case PIX_FMT_BGR24:
asm volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5)
WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
return;
case PIX_FMT_BGR555:
asm volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5)
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
return;
case PIX_FMT_BGR565:
asm volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5)
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
return;
case PIX_FMT_YUYV422:
asm volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2PACKED1(%%REGBP, %5)
WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
return;
}
}
else
{
switch(dstFormat)
{
case PIX_FMT_RGB32:
asm volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5)
WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
return;
case PIX_FMT_BGR24:
asm volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5)
WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
return;
case PIX_FMT_BGR555:
asm volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5)
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
  1540. "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
  1541. #endif
  1542. WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
  1543. "pop %%"REG_BP" \n\t"
  1544. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1545. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1546. "a" (&c->redDither)
  1547. );
  1548. return;
  1549. case PIX_FMT_BGR565:
  1550. asm volatile(
  1551. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1552. "mov %4, %%"REG_b" \n\t"
  1553. "push %%"REG_BP" \n\t"
  1554. YSCALEYUV2RGB1b(%%REGBP, %5)
  1555. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1556. #ifdef DITHER1XBPP
  1557. "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
  1558. "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
  1559. "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
  1560. #endif
  1561. WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
  1562. "pop %%"REG_BP" \n\t"
  1563. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1564. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1565. "a" (&c->redDither)
  1566. );
  1567. return;
  1568. case PIX_FMT_YUYV422:
  1569. asm volatile(
  1570. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1571. "mov %4, %%"REG_b" \n\t"
  1572. "push %%"REG_BP" \n\t"
  1573. YSCALEYUV2PACKED1b(%%REGBP, %5)
  1574. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1575. "pop %%"REG_BP" \n\t"
  1576. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1577. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1578. "a" (&c->redDither)
  1579. );
  1580. return;
  1581. }
  1582. }
  1583. #endif
  1584. if( uvalpha < 2048 )
  1585. {
  1586. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
  1587. }else{
  1588. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
  1589. }
  1590. }
  1591. //FIXME yuy2* can read up to 7 samples too many
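/* packed YUYV is laid out Y0 U0 Y1 V0, so luma lives at even byte offsets (src[2*i]);
   the MMX path below masks with bm01010101 to keep only those bytes. */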
  1592. static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
  1593. {
  1594. #ifdef HAVE_MMX
  1595. asm volatile(
  1596. "movq "MANGLE(bm01010101)", %%mm2\n\t"
  1597. "mov %0, %%"REG_a" \n\t"
  1598. "1: \n\t"
  1599. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1600. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1601. "pand %%mm2, %%mm0 \n\t"
  1602. "pand %%mm2, %%mm1 \n\t"
  1603. "packuswb %%mm1, %%mm0 \n\t"
  1604. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1605. "add $8, %%"REG_a" \n\t"
  1606. " js 1b \n\t"
  1607. : : "g" (-width), "r" (src+width*2), "r" (dst+width)
  1608. : "%"REG_a
  1609. );
  1610. #else
  1611. int i;
  1612. for(i=0; i<width; i++)
  1613. dst[i]= src[2*i];
  1614. #endif
  1615. }
  1616. static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
  1617. {
  1618. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  1619. asm volatile(
  1620. "movq "MANGLE(bm01010101)", %%mm4\n\t"
  1621. "mov %0, %%"REG_a" \n\t"
  1622. "1: \n\t"
  1623. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1624. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1625. "movq (%2, %%"REG_a",4), %%mm2 \n\t"
  1626. "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
  1627. PAVGB(%%mm2, %%mm0)
  1628. PAVGB(%%mm3, %%mm1)
  1629. "psrlw $8, %%mm0 \n\t"
  1630. "psrlw $8, %%mm1 \n\t"
  1631. "packuswb %%mm1, %%mm0 \n\t"
  1632. "movq %%mm0, %%mm1 \n\t"
  1633. "psrlw $8, %%mm0 \n\t"
  1634. "pand %%mm4, %%mm1 \n\t"
  1635. "packuswb %%mm0, %%mm0 \n\t"
  1636. "packuswb %%mm1, %%mm1 \n\t"
  1637. "movd %%mm0, (%4, %%"REG_a") \n\t"
  1638. "movd %%mm1, (%3, %%"REG_a") \n\t"
  1639. "add $4, %%"REG_a" \n\t"
  1640. " js 1b \n\t"
  1641. : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
  1642. : "%"REG_a
  1643. );
  1644. #else
  1645. int i;
  1646. for(i=0; i<width; i++)
  1647. {
  1648. dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
  1649. dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
  1650. }
  1651. #endif
  1652. }
  1653. //this is almost identical to the previous one, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
  1654. static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
  1655. {
  1656. #ifdef HAVE_MMX
  1657. asm volatile(
  1658. "mov %0, %%"REG_a" \n\t"
  1659. "1: \n\t"
  1660. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1661. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1662. "psrlw $8, %%mm0 \n\t"
  1663. "psrlw $8, %%mm1 \n\t"
  1664. "packuswb %%mm1, %%mm0 \n\t"
  1665. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1666. "add $8, %%"REG_a" \n\t"
  1667. " js 1b \n\t"
  1668. : : "g" (-width), "r" (src+width*2), "r" (dst+width)
  1669. : "%"REG_a
  1670. );
  1671. #else
  1672. int i;
  1673. for(i=0; i<width; i++)
  1674. dst[i]= src[2*i+1];
  1675. #endif
  1676. }
  1677. static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
  1678. {
  1679. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  1680. asm volatile(
  1681. "movq "MANGLE(bm01010101)", %%mm4\n\t"
  1682. "mov %0, %%"REG_a" \n\t"
  1683. "1: \n\t"
  1684. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1685. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1686. "movq (%2, %%"REG_a",4), %%mm2 \n\t"
  1687. "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
  1688. PAVGB(%%mm2, %%mm0)
  1689. PAVGB(%%mm3, %%mm1)
  1690. "pand %%mm4, %%mm0 \n\t"
  1691. "pand %%mm4, %%mm1 \n\t"
  1692. "packuswb %%mm1, %%mm0 \n\t"
  1693. "movq %%mm0, %%mm1 \n\t"
  1694. "psrlw $8, %%mm0 \n\t"
  1695. "pand %%mm4, %%mm1 \n\t"
  1696. "packuswb %%mm0, %%mm0 \n\t"
  1697. "packuswb %%mm1, %%mm1 \n\t"
  1698. "movd %%mm0, (%4, %%"REG_a") \n\t"
  1699. "movd %%mm1, (%3, %%"REG_a") \n\t"
  1700. "add $4, %%"REG_a" \n\t"
  1701. " js 1b \n\t"
  1702. : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
  1703. : "%"REG_a
  1704. );
  1705. #else
  1706. int i;
  1707. for(i=0; i<width; i++)
  1708. {
  1709. dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
  1710. dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
  1711. }
  1712. #endif
  1713. }
  1714. static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
  1715. {
  1716. int i;
  1717. for(i=0; i<width; i++)
  1718. {
  1719. int b= ((uint32_t*)src)[i]&0xFF;
  1720. int g= (((uint32_t*)src)[i]>>8)&0xFF;
  1721. int r= (((uint32_t*)src)[i]>>16)&0xFF;
  1722. dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
  1723. }
  1724. }
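/* the UV converter below averages a 2x2 block: two adjacent pixels from each of the two
   source lines are summed with the channels kept in separate bit fields (l carries B and R,
   h carries G), which is why the final shift is RGB2YUV_SHIFT+2. */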
  1725. static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
  1726. {
  1727. int i;
  1728. for(i=0; i<width; i++)
  1729. {
  1730. const int a= ((uint32_t*)src1)[2*i+0];
  1731. const int e= ((uint32_t*)src1)[2*i+1];
  1732. const int c= ((uint32_t*)src2)[2*i+0];
  1733. const int d= ((uint32_t*)src2)[2*i+1];
  1734. const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
  1735. const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
  1736. const int b= l&0x3FF;
  1737. const int g= h>>8;
  1738. const int r= l>>16;
  1739. dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
  1740. dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
  1741. }
  1742. }
  1743. static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
  1744. {
  1745. #ifdef HAVE_MMX
  1746. asm volatile(
  1747. "mov %2, %%"REG_a" \n\t"
  1748. "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
  1749. "movq "MANGLE(w1111)", %%mm5 \n\t"
  1750. "pxor %%mm7, %%mm7 \n\t"
  1751. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
  1752. ASMALIGN(4)
  1753. "1: \n\t"
  1754. PREFETCH" 64(%0, %%"REG_d") \n\t"
  1755. "movd (%0, %%"REG_d"), %%mm0 \n\t"
  1756. "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
  1757. "punpcklbw %%mm7, %%mm0 \n\t"
  1758. "punpcklbw %%mm7, %%mm1 \n\t"
  1759. "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
  1760. "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
  1761. "punpcklbw %%mm7, %%mm2 \n\t"
  1762. "punpcklbw %%mm7, %%mm3 \n\t"
  1763. "pmaddwd %%mm6, %%mm0 \n\t"
  1764. "pmaddwd %%mm6, %%mm1 \n\t"
  1765. "pmaddwd %%mm6, %%mm2 \n\t"
  1766. "pmaddwd %%mm6, %%mm3 \n\t"
  1767. #ifndef FAST_BGR2YV12
  1768. "psrad $8, %%mm0 \n\t"
  1769. "psrad $8, %%mm1 \n\t"
  1770. "psrad $8, %%mm2 \n\t"
  1771. "psrad $8, %%mm3 \n\t"
  1772. #endif
  1773. "packssdw %%mm1, %%mm0 \n\t"
  1774. "packssdw %%mm3, %%mm2 \n\t"
  1775. "pmaddwd %%mm5, %%mm0 \n\t"
  1776. "pmaddwd %%mm5, %%mm2 \n\t"
  1777. "packssdw %%mm2, %%mm0 \n\t"
  1778. "psraw $7, %%mm0 \n\t"
  1779. "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
  1780. "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
  1781. "punpcklbw %%mm7, %%mm4 \n\t"
  1782. "punpcklbw %%mm7, %%mm1 \n\t"
  1783. "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
  1784. "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
  1785. "punpcklbw %%mm7, %%mm2 \n\t"
  1786. "punpcklbw %%mm7, %%mm3 \n\t"
  1787. "pmaddwd %%mm6, %%mm4 \n\t"
  1788. "pmaddwd %%mm6, %%mm1 \n\t"
  1789. "pmaddwd %%mm6, %%mm2 \n\t"
  1790. "pmaddwd %%mm6, %%mm3 \n\t"
  1791. #ifndef FAST_BGR2YV12
  1792. "psrad $8, %%mm4 \n\t"
  1793. "psrad $8, %%mm1 \n\t"
  1794. "psrad $8, %%mm2 \n\t"
  1795. "psrad $8, %%mm3 \n\t"
  1796. #endif
  1797. "packssdw %%mm1, %%mm4 \n\t"
  1798. "packssdw %%mm3, %%mm2 \n\t"
  1799. "pmaddwd %%mm5, %%mm4 \n\t"
  1800. "pmaddwd %%mm5, %%mm2 \n\t"
  1801. "add $24, %%"REG_d" \n\t"
  1802. "packssdw %%mm2, %%mm4 \n\t"
  1803. "psraw $7, %%mm4 \n\t"
  1804. "packuswb %%mm4, %%mm0 \n\t"
  1805. "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
  1806. "movq %%mm0, (%1, %%"REG_a") \n\t"
  1807. "add $8, %%"REG_a" \n\t"
  1808. " js 1b \n\t"
  1809. : : "r" (src+width*3), "r" (dst+width), "g" (-width)
  1810. : "%"REG_a, "%"REG_d
  1811. );
  1812. #else
  1813. int i;
  1814. for(i=0; i<width; i++)
  1815. {
  1816. int b= src[i*3+0];
  1817. int g= src[i*3+1];
  1818. int r= src[i*3+2];
  1819. dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
  1820. }
  1821. #endif
  1822. }
  1823. static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
  1824. {
  1825. #ifdef HAVE_MMX
  1826. asm volatile(
  1827. "mov %4, %%"REG_a" \n\t"
  1828. "movq "MANGLE(w1111)", %%mm5 \n\t"
  1829. "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
  1830. "pxor %%mm7, %%mm7 \n\t"
  1831. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
  1832. "add %%"REG_d", %%"REG_d" \n\t"
  1833. ASMALIGN(4)
  1834. "1: \n\t"
  1835. PREFETCH" 64(%0, %%"REG_d") \n\t"
  1836. PREFETCH" 64(%1, %%"REG_d") \n\t"
  1837. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  1838. "movq (%0, %%"REG_d"), %%mm0 \n\t"
  1839. "movq (%1, %%"REG_d"), %%mm1 \n\t"
  1840. "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
  1841. "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
  1842. PAVGB(%%mm1, %%mm0)
  1843. PAVGB(%%mm3, %%mm2)
  1844. "movq %%mm0, %%mm1 \n\t"
  1845. "movq %%mm2, %%mm3 \n\t"
  1846. "psrlq $24, %%mm0 \n\t"
  1847. "psrlq $24, %%mm2 \n\t"
  1848. PAVGB(%%mm1, %%mm0)
  1849. PAVGB(%%mm3, %%mm2)
  1850. "punpcklbw %%mm7, %%mm0 \n\t"
  1851. "punpcklbw %%mm7, %%mm2 \n\t"
  1852. #else
  1853. "movd (%0, %%"REG_d"), %%mm0 \n\t"
  1854. "movd (%1, %%"REG_d"), %%mm1 \n\t"
  1855. "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
  1856. "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
  1857. "punpcklbw %%mm7, %%mm0 \n\t"
  1858. "punpcklbw %%mm7, %%mm1 \n\t"
  1859. "punpcklbw %%mm7, %%mm2 \n\t"
  1860. "punpcklbw %%mm7, %%mm3 \n\t"
  1861. "paddw %%mm1, %%mm0 \n\t"
  1862. "paddw %%mm3, %%mm2 \n\t"
  1863. "paddw %%mm2, %%mm0 \n\t"
  1864. "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
  1865. "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
  1866. "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
  1867. "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
  1868. "punpcklbw %%mm7, %%mm4 \n\t"
  1869. "punpcklbw %%mm7, %%mm1 \n\t"
  1870. "punpcklbw %%mm7, %%mm2 \n\t"
  1871. "punpcklbw %%mm7, %%mm3 \n\t"
  1872. "paddw %%mm1, %%mm4 \n\t"
  1873. "paddw %%mm3, %%mm2 \n\t"
  1874. "paddw %%mm4, %%mm2 \n\t"
  1875. "psrlw $2, %%mm0 \n\t"
  1876. "psrlw $2, %%mm2 \n\t"
  1877. #endif
  1878. "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
  1879. "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
  1880. "pmaddwd %%mm0, %%mm1 \n\t"
  1881. "pmaddwd %%mm2, %%mm3 \n\t"
  1882. "pmaddwd %%mm6, %%mm0 \n\t"
  1883. "pmaddwd %%mm6, %%mm2 \n\t"
  1884. #ifndef FAST_BGR2YV12
  1885. "psrad $8, %%mm0 \n\t"
  1886. "psrad $8, %%mm1 \n\t"
  1887. "psrad $8, %%mm2 \n\t"
  1888. "psrad $8, %%mm3 \n\t"
  1889. #endif
  1890. "packssdw %%mm2, %%mm0 \n\t"
  1891. "packssdw %%mm3, %%mm1 \n\t"
  1892. "pmaddwd %%mm5, %%mm0 \n\t"
  1893. "pmaddwd %%mm5, %%mm1 \n\t"
  1894. "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
  1895. "psraw $7, %%mm0 \n\t"
  1896. #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
  1897. "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
  1898. "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
  1899. "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
  1900. "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
  1901. PAVGB(%%mm1, %%mm4)
  1902. PAVGB(%%mm3, %%mm2)
  1903. "movq %%mm4, %%mm1 \n\t"
  1904. "movq %%mm2, %%mm3 \n\t"
  1905. "psrlq $24, %%mm4 \n\t"
  1906. "psrlq $24, %%mm2 \n\t"
  1907. PAVGB(%%mm1, %%mm4)
  1908. PAVGB(%%mm3, %%mm2)
  1909. "punpcklbw %%mm7, %%mm4 \n\t"
  1910. "punpcklbw %%mm7, %%mm2 \n\t"
  1911. #else
  1912. "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
  1913. "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
  1914. "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
  1915. "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
  1916. "punpcklbw %%mm7, %%mm4 \n\t"
  1917. "punpcklbw %%mm7, %%mm1 \n\t"
  1918. "punpcklbw %%mm7, %%mm2 \n\t"
  1919. "punpcklbw %%mm7, %%mm3 \n\t"
  1920. "paddw %%mm1, %%mm4 \n\t"
  1921. "paddw %%mm3, %%mm2 \n\t"
  1922. "paddw %%mm2, %%mm4 \n\t"
  1923. "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
  1924. "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
  1925. "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
  1926. "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
  1927. "punpcklbw %%mm7, %%mm5 \n\t"
  1928. "punpcklbw %%mm7, %%mm1 \n\t"
  1929. "punpcklbw %%mm7, %%mm2 \n\t"
  1930. "punpcklbw %%mm7, %%mm3 \n\t"
  1931. "paddw %%mm1, %%mm5 \n\t"
  1932. "paddw %%mm3, %%mm2 \n\t"
  1933. "paddw %%mm5, %%mm2 \n\t"
  1934. "movq "MANGLE(w1111)", %%mm5 \n\t"
  1935. "psrlw $2, %%mm4 \n\t"
  1936. "psrlw $2, %%mm2 \n\t"
  1937. #endif
  1938. "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
  1939. "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
  1940. "pmaddwd %%mm4, %%mm1 \n\t"
  1941. "pmaddwd %%mm2, %%mm3 \n\t"
  1942. "pmaddwd %%mm6, %%mm4 \n\t"
  1943. "pmaddwd %%mm6, %%mm2 \n\t"
  1944. #ifndef FAST_BGR2YV12
  1945. "psrad $8, %%mm4 \n\t"
  1946. "psrad $8, %%mm1 \n\t"
  1947. "psrad $8, %%mm2 \n\t"
  1948. "psrad $8, %%mm3 \n\t"
  1949. #endif
  1950. "packssdw %%mm2, %%mm4 \n\t"
  1951. "packssdw %%mm3, %%mm1 \n\t"
  1952. "pmaddwd %%mm5, %%mm4 \n\t"
  1953. "pmaddwd %%mm5, %%mm1 \n\t"
  1954. "add $24, %%"REG_d" \n\t"
  1955. "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
  1956. "psraw $7, %%mm4 \n\t"
  1957. "movq %%mm0, %%mm1 \n\t"
  1958. "punpckldq %%mm4, %%mm0 \n\t"
  1959. "punpckhdq %%mm4, %%mm1 \n\t"
  1960. "packsswb %%mm1, %%mm0 \n\t"
  1961. "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
  1962. "movd %%mm0, (%2, %%"REG_a") \n\t"
  1963. "punpckhdq %%mm0, %%mm0 \n\t"
  1964. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1965. "add $4, %%"REG_a" \n\t"
  1966. " js 1b \n\t"
  1967. : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
  1968. : "%"REG_a, "%"REG_d
  1969. );
  1970. #else
  1971. int i;
  1972. for(i=0; i<width; i++)
  1973. {
  1974. int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
  1975. int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
  1976. int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
  1977. dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
  1978. dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
  1979. }
  1980. #endif
  1981. }
  1982. static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
  1983. {
  1984. int i;
  1985. for(i=0; i<width; i++)
  1986. {
  1987. int d= ((uint16_t*)src)[i];
  1988. int b= d&0x1F;
  1989. int g= (d>>5)&0x3F;
  1990. int r= (d>>11)&0x1F;
  1991. dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
  1992. }
  1993. }
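/* bgr16ToUV/bgr15ToUV below load two 16-bit pixels per line at once (the uint32_t reads),
   so each chroma sample averages a 2x2 block; the channel sums are accumulated in separate
   bit fields before being unpacked, and the adjusted shifts compensate for the 4-pixel sum
   and the 5/6-bit channel depth. */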
  1994. static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
  1995. {
  1996. int i;
  1997. for(i=0; i<width; i++)
  1998. {
  1999. int d0= ((uint32_t*)src1)[i];
  2000. int d1= ((uint32_t*)src2)[i];
  2001. int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
  2002. int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
  2003. int dh2= (dh>>11) + (dh<<21);
  2004. int d= dh2 + dl;
  2005. int b= d&0x7F;
  2006. int r= (d>>11)&0x7F;
  2007. int g= d>>21;
  2008. dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
  2009. dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
  2010. }
  2011. }
  2012. static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
  2013. {
  2014. int i;
  2015. for(i=0; i<width; i++)
  2016. {
  2017. int d= ((uint16_t*)src)[i];
  2018. int b= d&0x1F;
  2019. int g= (d>>5)&0x1F;
  2020. int r= (d>>10)&0x1F;
  2021. dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
  2022. }
  2023. }
  2024. static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
  2025. {
  2026. int i;
  2027. for(i=0; i<width; i++)
  2028. {
  2029. int d0= ((uint32_t*)src1)[i];
  2030. int d1= ((uint32_t*)src2)[i];
  2031. int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
  2032. int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
  2033. int dh2= (dh>>11) + (dh<<21);
  2034. int d= dh2 + dl;
  2035. int b= d&0x7F;
  2036. int r= (d>>10)&0x7F;
  2037. int g= d>>21;
  2038. dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
  2039. dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
  2040. }
  2041. }
  2042. static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
  2043. {
  2044. int i;
  2045. for(i=0; i<width; i++)
  2046. {
  2047. int r= ((uint32_t*)src)[i]&0xFF;
  2048. int g= (((uint32_t*)src)[i]>>8)&0xFF;
  2049. int b= (((uint32_t*)src)[i]>>16)&0xFF;
  2050. dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
  2051. }
  2052. }
  2053. static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
  2054. {
  2055. int i;
  2056. for(i=0; i<width; i++)
  2057. {
  2058. const int a= ((uint32_t*)src1)[2*i+0];
  2059. const int e= ((uint32_t*)src1)[2*i+1];
  2060. const int c= ((uint32_t*)src2)[2*i+0];
  2061. const int d= ((uint32_t*)src2)[2*i+1];
  2062. const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
  2063. const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
  2064. const int r= l&0x3FF;
  2065. const int g= h>>8;
  2066. const int b= l>>16;
  2067. dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
  2068. dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
  2069. }
  2070. }
  2071. static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
  2072. {
  2073. int i;
  2074. for(i=0; i<width; i++)
  2075. {
  2076. int r= src[i*3+0];
  2077. int g= src[i*3+1];
  2078. int b= src[i*3+2];
  2079. dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
  2080. }
  2081. }
  2082. static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
  2083. {
  2084. int i;
  2085. for(i=0; i<width; i++)
  2086. {
  2087. int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
  2088. int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
  2089. int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
  2090. dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
  2091. dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
  2092. }
  2093. }
  2094. // Bilinear / Bicubic scaling
  2095. static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
  2096. int16_t *filter, int16_t *filterPos, long filterSize)
  2097. {
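/* generic horizontal convolution: for each output sample i,
   dst[i] = clip(sum_j src[filterPos[i]+j]*filter[i*filterSize+j] >> 7, 0, (1<<15)-1)
   exactly as in the C fallback at the bottom; the MMX code below is unrolled for
   filterSize 4 and 8 and uses a generic inner loop otherwise. */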
  2098. #ifdef HAVE_MMX
  2099. assert(filterSize % 4 == 0 && filterSize>0);
  2100. if(filterSize==4) // always true for upscaling, sometimes for downscaling too
  2101. {
  2102. long counter= -2*dstW;
  2103. filter-= counter*2;
  2104. filterPos-= counter/2;
  2105. dst-= counter/2;
  2106. asm volatile(
  2107. #if defined(PIC)
  2108. "push %%"REG_b" \n\t"
  2109. #endif
  2110. "pxor %%mm7, %%mm7 \n\t"
  2111. "movq "MANGLE(w02)", %%mm6 \n\t"
  2112. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  2113. "mov %%"REG_a", %%"REG_BP" \n\t"
  2114. ASMALIGN(4)
  2115. "1: \n\t"
  2116. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  2117. "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
  2118. "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
  2119. "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
  2120. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  2121. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  2122. "punpcklbw %%mm7, %%mm0 \n\t"
  2123. "punpcklbw %%mm7, %%mm2 \n\t"
  2124. "pmaddwd %%mm1, %%mm0 \n\t"
  2125. "pmaddwd %%mm2, %%mm3 \n\t"
  2126. "psrad $8, %%mm0 \n\t"
  2127. "psrad $8, %%mm3 \n\t"
  2128. "packssdw %%mm3, %%mm0 \n\t"
  2129. "pmaddwd %%mm6, %%mm0 \n\t"
  2130. "packssdw %%mm0, %%mm0 \n\t"
  2131. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  2132. "add $4, %%"REG_BP" \n\t"
  2133. " jnc 1b \n\t"
  2134. "pop %%"REG_BP" \n\t"
  2135. #if defined(PIC)
  2136. "pop %%"REG_b" \n\t"
  2137. #endif
  2138. : "+a" (counter)
  2139. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  2140. #if !defined(PIC)
  2141. : "%"REG_b
  2142. #endif
  2143. );
  2144. }
  2145. else if(filterSize==8)
  2146. {
  2147. long counter= -2*dstW;
  2148. filter-= counter*4;
  2149. filterPos-= counter/2;
  2150. dst-= counter/2;
  2151. asm volatile(
  2152. #if defined(PIC)
  2153. "push %%"REG_b" \n\t"
  2154. #endif
  2155. "pxor %%mm7, %%mm7 \n\t"
  2156. "movq "MANGLE(w02)", %%mm6 \n\t"
  2157. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  2158. "mov %%"REG_a", %%"REG_BP" \n\t"
  2159. ASMALIGN(4)
  2160. "1: \n\t"
  2161. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  2162. "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
  2163. "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
  2164. "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
  2165. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  2166. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  2167. "punpcklbw %%mm7, %%mm0 \n\t"
  2168. "punpcklbw %%mm7, %%mm2 \n\t"
  2169. "pmaddwd %%mm1, %%mm0 \n\t"
  2170. "pmaddwd %%mm2, %%mm3 \n\t"
  2171. "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
  2172. "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
  2173. "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
  2174. "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
  2175. "punpcklbw %%mm7, %%mm4 \n\t"
  2176. "punpcklbw %%mm7, %%mm2 \n\t"
  2177. "pmaddwd %%mm1, %%mm4 \n\t"
  2178. "pmaddwd %%mm2, %%mm5 \n\t"
  2179. "paddd %%mm4, %%mm0 \n\t"
  2180. "paddd %%mm5, %%mm3 \n\t"
  2181. "psrad $8, %%mm0 \n\t"
  2182. "psrad $8, %%mm3 \n\t"
  2183. "packssdw %%mm3, %%mm0 \n\t"
  2184. "pmaddwd %%mm6, %%mm0 \n\t"
  2185. "packssdw %%mm0, %%mm0 \n\t"
  2186. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  2187. "add $4, %%"REG_BP" \n\t"
  2188. " jnc 1b \n\t"
  2189. "pop %%"REG_BP" \n\t"
  2190. #if defined(PIC)
  2191. "pop %%"REG_b" \n\t"
  2192. #endif
  2193. : "+a" (counter)
  2194. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  2195. #if !defined(PIC)
  2196. : "%"REG_b
  2197. #endif
  2198. );
  2199. }
  2200. else
  2201. {
  2202. uint8_t *offset = src+filterSize;
  2203. long counter= -2*dstW;
  2204. // filter-= counter*filterSize/2;
  2205. filterPos-= counter/2;
  2206. dst-= counter/2;
  2207. asm volatile(
  2208. "pxor %%mm7, %%mm7 \n\t"
  2209. "movq "MANGLE(w02)", %%mm6 \n\t"
  2210. ASMALIGN(4)
  2211. "1: \n\t"
  2212. "mov %2, %%"REG_c" \n\t"
  2213. "movzwl (%%"REG_c", %0), %%eax \n\t"
  2214. "movzwl 2(%%"REG_c", %0), %%edx \n\t"
  2215. "mov %5, %%"REG_c" \n\t"
  2216. "pxor %%mm4, %%mm4 \n\t"
  2217. "pxor %%mm5, %%mm5 \n\t"
  2218. "2: \n\t"
  2219. "movq (%1), %%mm1 \n\t"
  2220. "movq (%1, %6), %%mm3 \n\t"
  2221. "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
  2222. "movd (%%"REG_c", %%"REG_d"), %%mm2\n\t"
  2223. "punpcklbw %%mm7, %%mm0 \n\t"
  2224. "punpcklbw %%mm7, %%mm2 \n\t"
  2225. "pmaddwd %%mm1, %%mm0 \n\t"
  2226. "pmaddwd %%mm2, %%mm3 \n\t"
  2227. "paddd %%mm3, %%mm5 \n\t"
  2228. "paddd %%mm0, %%mm4 \n\t"
  2229. "add $8, %1 \n\t"
  2230. "add $4, %%"REG_c" \n\t"
  2231. "cmp %4, %%"REG_c" \n\t"
  2232. " jb 2b \n\t"
  2233. "add %6, %1 \n\t"
  2234. "psrad $8, %%mm4 \n\t"
  2235. "psrad $8, %%mm5 \n\t"
  2236. "packssdw %%mm5, %%mm4 \n\t"
  2237. "pmaddwd %%mm6, %%mm4 \n\t"
  2238. "packssdw %%mm4, %%mm4 \n\t"
  2239. "mov %3, %%"REG_a" \n\t"
  2240. "movd %%mm4, (%%"REG_a", %0) \n\t"
  2241. "add $4, %0 \n\t"
  2242. " jnc 1b \n\t"
  2243. : "+r" (counter), "+r" (filter)
  2244. : "m" (filterPos), "m" (dst), "m"(offset),
  2245. "m" (src), "r" (filterSize*2)
  2246. : "%"REG_a, "%"REG_c, "%"REG_d
  2247. );
  2248. }
  2249. #else
  2250. #ifdef HAVE_ALTIVEC
  2251. hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
  2252. #else
  2253. int i;
  2254. for(i=0; i<dstW; i++)
  2255. {
  2256. int j;
  2257. int srcPos= filterPos[i];
  2258. int val=0;
  2259. // printf("filterPos: %d\n", filterPos[i]);
  2260. for(j=0; j<filterSize; j++)
  2261. {
  2262. // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
  2263. val += ((int)src[srcPos + j])*filter[filterSize*i + j];
  2264. }
  2265. // filter += hFilterSize;
  2266. dst[i] = FFMIN(FFMAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
  2267. // dst[i] = val>>7;
  2268. }
  2269. #endif
  2270. #endif
  2271. }
  2272. // *** horizontal scale Y line to temp buffer
  2273. static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
  2274. int flags, int canMMX2BeUsed, int16_t *hLumFilter,
  2275. int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
  2276. int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
  2277. int32_t *mmx2FilterPos)
  2278. {
  2279. if(srcFormat==PIX_FMT_YUYV422)
  2280. {
  2281. RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
  2282. src= formatConvBuffer;
  2283. }
  2284. else if(srcFormat==PIX_FMT_UYVY422)
  2285. {
  2286. RENAME(uyvyToY)(formatConvBuffer, src, srcW);
  2287. src= formatConvBuffer;
  2288. }
  2289. else if(srcFormat==PIX_FMT_RGB32)
  2290. {
  2291. RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
  2292. src= formatConvBuffer;
  2293. }
  2294. else if(srcFormat==PIX_FMT_BGR24)
  2295. {
  2296. RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
  2297. src= formatConvBuffer;
  2298. }
  2299. else if(srcFormat==PIX_FMT_BGR565)
  2300. {
  2301. RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
  2302. src= formatConvBuffer;
  2303. }
  2304. else if(srcFormat==PIX_FMT_BGR555)
  2305. {
  2306. RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
  2307. src= formatConvBuffer;
  2308. }
  2309. else if(srcFormat==PIX_FMT_BGR32)
  2310. {
  2311. RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
  2312. src= formatConvBuffer;
  2313. }
  2314. else if(srcFormat==PIX_FMT_RGB24)
  2315. {
  2316. RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
  2317. src= formatConvBuffer;
  2318. }
  2319. #ifdef HAVE_MMX
  2320. // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
  2321. if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
  2322. #else
  2323. if(!(flags&SWS_FAST_BILINEAR))
  2324. #endif
  2325. {
  2326. RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
  2327. }
  2328. else // Fast Bilinear upscale / crap downscale
  2329. {
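/* fast bilinear: xpos walks the source in 16.16 fixed point, xalpha = (xpos&0xFFFF)>>9 is a
   7-bit blend factor, and each output is src[xx]*(128-xalpha) + src[xx+1]*xalpha, i.e. a
   15-bit intermediate (see the plain C version at the end of this branch). */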
  2330. #if defined(ARCH_X86) || defined(ARCH_X86_64)
  2331. #ifdef HAVE_MMX2
  2332. int i;
  2333. #if defined(PIC)
  2334. uint64_t ebxsave __attribute__((aligned(8)));
  2335. #endif
  2336. if(canMMX2BeUsed)
  2337. {
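/* the MMX2 path calls into run-time generated scaler code (funnyYCode, generated elsewhere
   in swscale); FUNNY_Y_CODE is expanded 8 times, one call per chunk of the output line, and
   the small loop after the asm block pads the remaining samples with the rightmost source
   pixel. */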
  2338. asm volatile(
  2339. #if defined(PIC)
  2340. "mov %%"REG_b", %5 \n\t"
  2341. #endif
  2342. "pxor %%mm7, %%mm7 \n\t"
  2343. "mov %0, %%"REG_c" \n\t"
  2344. "mov %1, %%"REG_D" \n\t"
  2345. "mov %2, %%"REG_d" \n\t"
  2346. "mov %3, %%"REG_b" \n\t"
  2347. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2348. PREFETCH" (%%"REG_c") \n\t"
  2349. PREFETCH" 32(%%"REG_c") \n\t"
  2350. PREFETCH" 64(%%"REG_c") \n\t"
  2351. #ifdef ARCH_X86_64
  2352. #define FUNNY_Y_CODE \
  2353. "movl (%%"REG_b"), %%esi \n\t"\
  2354. "call *%4 \n\t"\
  2355. "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
  2356. "add %%"REG_S", %%"REG_c" \n\t"\
  2357. "add %%"REG_a", %%"REG_D" \n\t"\
  2358. "xor %%"REG_a", %%"REG_a" \n\t"\
  2359. #else
  2360. #define FUNNY_Y_CODE \
  2361. "movl (%%"REG_b"), %%esi \n\t"\
  2362. "call *%4 \n\t"\
  2363. "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
  2364. "add %%"REG_a", %%"REG_D" \n\t"\
  2365. "xor %%"REG_a", %%"REG_a" \n\t"\
  2366. #endif
  2367. FUNNY_Y_CODE
  2368. FUNNY_Y_CODE
  2369. FUNNY_Y_CODE
  2370. FUNNY_Y_CODE
  2371. FUNNY_Y_CODE
  2372. FUNNY_Y_CODE
  2373. FUNNY_Y_CODE
  2374. FUNNY_Y_CODE
  2375. #if defined(PIC)
  2376. "mov %5, %%"REG_b" \n\t"
  2377. #endif
  2378. :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
  2379. "m" (funnyYCode)
  2380. #if defined(PIC)
  2381. ,"m" (ebxsave)
  2382. #endif
  2383. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2384. #if !defined(PIC)
  2385. ,"%"REG_b
  2386. #endif
  2387. );
  2388. for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
  2389. }
  2390. else
  2391. {
  2392. #endif
  2393. long xInc_shr16 = xInc >> 16;
  2394. uint16_t xInc_mask = xInc & 0xffff;
  2395. //NO MMX just normal asm ...
  2396. asm volatile(
  2397. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2398. "xor %%"REG_d", %%"REG_d" \n\t" // xx
  2399. "xorl %%ecx, %%ecx \n\t" // 2*xalpha
  2400. ASMALIGN(4)
  2401. "1: \n\t"
  2402. "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
  2403. "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2404. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  2405. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  2406. "shll $16, %%edi \n\t"
  2407. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  2408. "mov %1, %%"REG_D" \n\t"
  2409. "shrl $9, %%esi \n\t"
  2410. "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
  2411. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  2412. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
  2413. "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
  2414. "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2415. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  2416. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  2417. "shll $16, %%edi \n\t"
  2418. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  2419. "mov %1, %%"REG_D" \n\t"
  2420. "shrl $9, %%esi \n\t"
  2421. "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
  2422. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  2423. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
  2424. "add $2, %%"REG_a" \n\t"
  2425. "cmp %2, %%"REG_a" \n\t"
  2426. " jb 1b \n\t"
  2427. :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
  2428. : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
  2429. );
  2430. #ifdef HAVE_MMX2
  2431. } //if MMX2 can't be used
  2432. #endif
  2433. #else
  2434. int i;
  2435. unsigned int xpos=0;
  2436. for(i=0;i<dstWidth;i++)
  2437. {
  2438. register unsigned int xx=xpos>>16;
  2439. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  2440. dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
  2441. xpos+=xInc;
  2442. }
  2443. #endif
  2444. }
  2445. }
  2446. inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
  2447. int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
  2448. int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
  2449. int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
  2450. int32_t *mmx2FilterPos)
  2451. {
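/* chroma counterpart of hyscale: U samples are written to dst[0..dstWidth-1] and V samples
   to dst+2048, matching the 2048-sample offset used by the *ToUV format converters above. */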
  2452. if(srcFormat==PIX_FMT_YUYV422)
  2453. {
  2454. RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
  2455. src1= formatConvBuffer;
  2456. src2= formatConvBuffer+2048;
  2457. }
  2458. else if(srcFormat==PIX_FMT_UYVY422)
  2459. {
  2460. RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
  2461. src1= formatConvBuffer;
  2462. src2= formatConvBuffer+2048;
  2463. }
  2464. else if(srcFormat==PIX_FMT_RGB32)
  2465. {
  2466. RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
  2467. src1= formatConvBuffer;
  2468. src2= formatConvBuffer+2048;
  2469. }
  2470. else if(srcFormat==PIX_FMT_BGR24)
  2471. {
  2472. RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
  2473. src1= formatConvBuffer;
  2474. src2= formatConvBuffer+2048;
  2475. }
  2476. else if(srcFormat==PIX_FMT_BGR565)
  2477. {
  2478. RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
  2479. src1= formatConvBuffer;
  2480. src2= formatConvBuffer+2048;
  2481. }
  2482. else if(srcFormat==PIX_FMT_BGR555)
  2483. {
  2484. RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
  2485. src1= formatConvBuffer;
  2486. src2= formatConvBuffer+2048;
  2487. }
  2488. else if(srcFormat==PIX_FMT_BGR32)
  2489. {
  2490. RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
  2491. src1= formatConvBuffer;
  2492. src2= formatConvBuffer+2048;
  2493. }
  2494. else if(srcFormat==PIX_FMT_RGB24)
  2495. {
  2496. RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
  2497. src1= formatConvBuffer;
  2498. src2= formatConvBuffer+2048;
  2499. }
  2500. else if(isGray(srcFormat))
  2501. {
  2502. return;
  2503. }
  2504. #ifdef HAVE_MMX
  2505. // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
  2506. if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
  2507. #else
  2508. if(!(flags&SWS_FAST_BILINEAR))
  2509. #endif
  2510. {
  2511. RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  2512. RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  2513. }
  2514. else // Fast Bilinear upscale / crap downscale
  2515. {
  2516. #if defined(ARCH_X86) || defined(ARCH_X86_64)
  2517. #ifdef HAVE_MMX2
  2518. int i;
  2519. #if defined(PIC)
  2520. uint64_t ebxsave __attribute__((aligned(8)));
  2521. #endif
  2522. if(canMMX2BeUsed)
  2523. {
  2524. asm volatile(
  2525. #if defined(PIC)
  2526. "mov %%"REG_b", %6 \n\t"
  2527. #endif
  2528. "pxor %%mm7, %%mm7 \n\t"
  2529. "mov %0, %%"REG_c" \n\t"
  2530. "mov %1, %%"REG_D" \n\t"
  2531. "mov %2, %%"REG_d" \n\t"
  2532. "mov %3, %%"REG_b" \n\t"
  2533. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2534. PREFETCH" (%%"REG_c") \n\t"
  2535. PREFETCH" 32(%%"REG_c") \n\t"
  2536. PREFETCH" 64(%%"REG_c") \n\t"
  2537. #ifdef ARCH_X86_64
  2538. #define FUNNY_UV_CODE \
  2539. "movl (%%"REG_b"), %%esi \n\t"\
  2540. "call *%4 \n\t"\
  2541. "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
  2542. "add %%"REG_S", %%"REG_c" \n\t"\
  2543. "add %%"REG_a", %%"REG_D" \n\t"\
  2544. "xor %%"REG_a", %%"REG_a" \n\t"\
  2545. #else
  2546. #define FUNNY_UV_CODE \
  2547. "movl (%%"REG_b"), %%esi \n\t"\
  2548. "call *%4 \n\t"\
  2549. "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
  2550. "add %%"REG_a", %%"REG_D" \n\t"\
  2551. "xor %%"REG_a", %%"REG_a" \n\t"\
  2552. #endif
  2553. FUNNY_UV_CODE
  2554. FUNNY_UV_CODE
  2555. FUNNY_UV_CODE
  2556. FUNNY_UV_CODE
  2557. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2558. "mov %5, %%"REG_c" \n\t" // src
  2559. "mov %1, %%"REG_D" \n\t" // buf1
  2560. "add $4096, %%"REG_D" \n\t"
  2561. PREFETCH" (%%"REG_c") \n\t"
  2562. PREFETCH" 32(%%"REG_c") \n\t"
  2563. PREFETCH" 64(%%"REG_c") \n\t"
  2564. FUNNY_UV_CODE
  2565. FUNNY_UV_CODE
  2566. FUNNY_UV_CODE
  2567. FUNNY_UV_CODE
  2568. #if defined(PIC)
  2569. "mov %6, %%"REG_b" \n\t"
  2570. #endif
  2571. :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
  2572. "m" (funnyUVCode), "m" (src2)
  2573. #if defined(PIC)
  2574. ,"m" (ebxsave)
  2575. #endif
  2576. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2577. #if !defined(PIC)
  2578. ,"%"REG_b
  2579. #endif
  2580. );
  2581. for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
  2582. {
  2583. // printf("%d %d %d\n", dstWidth, i, srcW);
  2584. dst[i] = src1[srcW-1]*128;
  2585. dst[i+2048] = src2[srcW-1]*128;
  2586. }
  2587. }
  2588. else
  2589. {
  2590. #endif
  2591. long xInc_shr16 = (long) (xInc >> 16);
  2592. uint16_t xInc_mask = xInc & 0xffff;
  2593. asm volatile(
  2594. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2595. "xor %%"REG_d", %%"REG_d" \n\t" // xx
  2596. "xorl %%ecx, %%ecx \n\t" // 2*xalpha
  2597. ASMALIGN(4)
  2598. "1: \n\t"
  2599. "mov %0, %%"REG_S" \n\t"
  2600. "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
  2601. "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
  2602. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  2603. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  2604. "shll $16, %%edi \n\t"
  2605. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  2606. "mov %1, %%"REG_D" \n\t"
  2607. "shrl $9, %%esi \n\t"
  2608. "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
  2609. "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
  2610. "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2611. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  2612. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  2613. "shll $16, %%edi \n\t"
  2614. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  2615. "mov %1, %%"REG_D" \n\t"
  2616. "shrl $9, %%esi \n\t"
  2617. "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
  2618. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  2619. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
  2620. "add $1, %%"REG_a" \n\t"
  2621. "cmp %2, %%"REG_a" \n\t"
  2622. " jb 1b \n\t"
  2623. /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
  2624. which is needed to support GCC-4.0 */
  2625. #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
  2626. :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
  2627. #else
  2628. :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
  2629. #endif
  2630. "r" (src2)
  2631. : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
  2632. );
  2633. #ifdef HAVE_MMX2
  2634. } //if MMX2 can't be used
  2635. #endif
  2636. #else
  2637. int i;
  2638. unsigned int xpos=0;
  2639. for(i=0;i<dstWidth;i++)
  2640. {
  2641. register unsigned int xx=xpos>>16;
  2642. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  2643. dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
  2644. dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
  2645. /* slower
  2646. dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
  2647. dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
  2648. */
  2649. xpos+=xInc;
  2650. }
  2651. #endif
  2652. }
  2653. }
  2654. static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  2655. int srcSliceH, uint8_t* dst[], int dstStride[]){
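/* swScale is called once per slice of srcSliceH source lines starting at srcSliceY; it
   outputs as many destination lines as the slice allows and returns how many were written
   (dstY - lastDstY). The changed state is stored back into the context so the next slice
   continues where this one stopped. */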
  2656. /* load a few things into local vars to make the code more readable and faster */
  2657. const int srcW= c->srcW;
  2658. const int dstW= c->dstW;
  2659. const int dstH= c->dstH;
  2660. const int chrDstW= c->chrDstW;
  2661. const int chrSrcW= c->chrSrcW;
  2662. const int lumXInc= c->lumXInc;
  2663. const int chrXInc= c->chrXInc;
  2664. const int dstFormat= c->dstFormat;
  2665. const int srcFormat= c->srcFormat;
  2666. const int flags= c->flags;
  2667. const int canMMX2BeUsed= c->canMMX2BeUsed;
  2668. int16_t *vLumFilterPos= c->vLumFilterPos;
  2669. int16_t *vChrFilterPos= c->vChrFilterPos;
  2670. int16_t *hLumFilterPos= c->hLumFilterPos;
  2671. int16_t *hChrFilterPos= c->hChrFilterPos;
  2672. int16_t *vLumFilter= c->vLumFilter;
  2673. int16_t *vChrFilter= c->vChrFilter;
  2674. int16_t *hLumFilter= c->hLumFilter;
  2675. int16_t *hChrFilter= c->hChrFilter;
  2676. int32_t *lumMmxFilter= c->lumMmxFilter;
  2677. int32_t *chrMmxFilter= c->chrMmxFilter;
  2678. const int vLumFilterSize= c->vLumFilterSize;
  2679. const int vChrFilterSize= c->vChrFilterSize;
  2680. const int hLumFilterSize= c->hLumFilterSize;
  2681. const int hChrFilterSize= c->hChrFilterSize;
  2682. int16_t **lumPixBuf= c->lumPixBuf;
  2683. int16_t **chrPixBuf= c->chrPixBuf;
  2684. const int vLumBufSize= c->vLumBufSize;
  2685. const int vChrBufSize= c->vChrBufSize;
  2686. uint8_t *funnyYCode= c->funnyYCode;
  2687. uint8_t *funnyUVCode= c->funnyUVCode;
  2688. uint8_t *formatConvBuffer= c->formatConvBuffer;
  2689. const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
  2690. const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
  2691. int lastDstY;
  2692. /* vars which will change and which we need to store back in the context */
  2693. int dstY= c->dstY;
  2694. int lumBufIndex= c->lumBufIndex;
  2695. int chrBufIndex= c->chrBufIndex;
  2696. int lastInLumBuf= c->lastInLumBuf;
  2697. int lastInChrBuf= c->lastInChrBuf;
  2698. if(isPacked(c->srcFormat)){
  2699. src[0]=
  2700. src[1]=
  2701. src[2]= src[0];
  2702. srcStride[0]=
  2703. srcStride[1]=
  2704. srcStride[2]= srcStride[0];
  2705. }
  2706. srcStride[1]<<= c->vChrDrop;
  2707. srcStride[2]<<= c->vChrDrop;
  2708. // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
  2709. // (int)dst[0], (int)dst[1], (int)dst[2]);
  2710. #if 0 //self test FIXME move to a vfilter or something
  2711. {
  2712. static volatile int i=0;
  2713. i++;
  2714. if(srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
  2715. selfTest(src, srcStride, c->srcW, c->srcH);
  2716. i--;
  2717. }
  2718. #endif
  2719. //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
  2720. //dstStride[0],dstStride[1],dstStride[2]);
  2721. if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
  2722. {
  2723. static int firstTime=1; //FIXME move this into the context perhaps
  2724. if(flags & SWS_PRINT_INFO && firstTime)
  2725. {
  2726. MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
  2727. "SwScaler: ->cannot do aligned memory accesses anymore\n");
  2728. firstTime=0;
  2729. }
  2730. }
  2731. /* Note: the user might start scaling the picture in the middle, so this will not get executed.
  2732. This is not really intended but works currently, so people might do it. */
  2733. if(srcSliceY ==0){
  2734. lumBufIndex=0;
  2735. chrBufIndex=0;
  2736. dstY=0;
  2737. lastInLumBuf= -1;
  2738. lastInChrBuf= -1;
  2739. }
  2740. lastDstY= dstY;
  2741. for(;dstY < dstH; dstY++){
  2742. unsigned char *dest =dst[0]+dstStride[0]*dstY;
  2743. const int chrDstY= dstY>>c->chrDstVSubSample;
  2744. unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
  2745. unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
  2746. const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
  2747. const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
  2748. const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
  2749. const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
  2750. //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
  2751. // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
  2752. //handle holes (FAST_BILINEAR & weird filters)
  2753. if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
  2754. if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
  2755. //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
  2756. ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
  2757. ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
  2758. // Do we have enough lines in this slice to output the dstY line?
  2759. if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
  2760. {
  2761. //Do horizontal scaling
  2762. while(lastInLumBuf < lastLumSrcY)
  2763. {
  2764. uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
  2765. lumBufIndex++;
  2766. // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
  2767. ASSERT(lumBufIndex < 2*vLumBufSize)
  2768. ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
  2769. ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
  2770. // printf("%d %d\n", lumBufIndex, vLumBufSize);
  2771. RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
  2772. flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
  2773. funnyYCode, c->srcFormat, formatConvBuffer,
  2774. c->lumMmx2Filter, c->lumMmx2FilterPos);
  2775. lastInLumBuf++;
  2776. }
  2777. while(lastInChrBuf < lastChrSrcY)
  2778. {
  2779. uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
  2780. uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
  2781. chrBufIndex++;
  2782. ASSERT(chrBufIndex < 2*vChrBufSize)
  2783. ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
  2784. ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
  2785. //FIXME replace parameters through context struct (some at least)
  2786. if(!(isGray(srcFormat) || isGray(dstFormat)))
  2787. RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
  2788. flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
  2789. funnyUVCode, c->srcFormat, formatConvBuffer,
  2790. c->chrMmx2Filter, c->chrMmx2FilterPos);
  2791. lastInChrBuf++;
  2792. }
  2793. //wrap buf index around to stay inside the ring buffer
  2794. if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
  2795. if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
  2796. }
  2797. else // not enough lines left in this slice -> load the rest in the buffer
  2798. {
  2799. /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
  2800. firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
  2801. lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
  2802. vChrBufSize, vLumBufSize);*/
  2803. //Do horizontal scaling
  2804. while(lastInLumBuf+1 < srcSliceY + srcSliceH)
  2805. {
  2806. uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
  2807. lumBufIndex++;
  2808. ASSERT(lumBufIndex < 2*vLumBufSize)
  2809. ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
  2810. ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
  2811. RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
  2812. flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
  2813. funnyYCode, c->srcFormat, formatConvBuffer,
  2814. c->lumMmx2Filter, c->lumMmx2FilterPos);
  2815. lastInLumBuf++;
  2816. }
  2817. while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
  2818. {
  2819. uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
  2820. uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
  2821. chrBufIndex++;
  2822. ASSERT(chrBufIndex < 2*vChrBufSize)
  2823. ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
  2824. ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
  2825. if(!(isGray(srcFormat) || isGray(dstFormat)))
  2826. RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
  2827. flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
  2828. funnyUVCode, c->srcFormat, formatConvBuffer,
  2829. c->chrMmx2Filter, c->chrMmx2FilterPos);
  2830. lastInChrBuf++;
  2831. }
  2832. //wrap buf index around to stay inside the ring buffer
  2833. if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
  2834. if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
  2835. break; //we can't output a dstY line so let's try with the next slice
  2836. }
  2837. #ifdef HAVE_MMX
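// pick the ordered-dither rows for this output line; they are added to B/G/R by the
// 15/16 bpp writers when DITHER1XBPP is defined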
  2838. b5Dither= dither8[dstY&1];
  2839. g6Dither= dither4[dstY&1];
  2840. g5Dither= dither8[dstY&1];
  2841. r5Dither= dither8[(dstY+1)&1];
  2842. #endif
  2843. if(dstY < dstH-2)
  2844. {
  2845. int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  2846. int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
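/* lumPixBuf/chrPixBuf hold 2*vLumBufSize / 2*vChrBufSize line pointers (the ring buffer
   pointer array is twice the ring size), so the vertical filter window starting at
   lumSrcPtr/chrSrcPtr never wraps; see the ASSERTs further down. */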
  2847. #ifdef HAVE_MMX
  2848. int i;
  2849. if(flags & SWS_ACCURATE_RND){
  2850. for(i=0; i<vLumFilterSize; i+=2){
  2851. lumMmxFilter[2*i+0]= lumSrcPtr[i ];
  2852. lumMmxFilter[2*i+1]= lumSrcPtr[i+(vLumFilterSize>1)];
  2853. lumMmxFilter[2*i+2]=
  2854. lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i ]
  2855. + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
  2856. }
  2857. for(i=0; i<vChrFilterSize; i+=2){
  2858. chrMmxFilter[2*i+0]= chrSrcPtr[i ];
  2859. chrMmxFilter[2*i+1]= chrSrcPtr[i+(vChrFilterSize>1)];
  2860. chrMmxFilter[2*i+2]=
  2861. chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i ]
  2862. + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
  2863. }
  2864. }else{
  2865. for(i=0; i<vLumFilterSize; i++)
  2866. {
  2867. lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
  2868. lumMmxFilter[4*i+2]=
  2869. lumMmxFilter[4*i+3]=
  2870. ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
  2871. }
  2872. for(i=0; i<vChrFilterSize; i++)
  2873. {
  2874. chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
  2875. chrMmxFilter[4*i+2]=
  2876. chrMmxFilter[4*i+3]=
  2877. ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
  2878. }
  2879. }
  2880. #endif
  2881. if(dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
  2882. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
  2883. if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
  2884. RENAME(yuv2nv12X)(c,
  2885. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2886. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2887. dest, uDest, dstW, chrDstW, dstFormat);
  2888. }
  2889. else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
  2890. {
  2891. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
  2892. if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
  2893. if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
  2894. {
  2895. int16_t *lumBuf = lumPixBuf[0];
  2896. int16_t *chrBuf= chrPixBuf[0];
  2897. RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
  2898. }
  2899. else //General YV12
  2900. {
  2901. RENAME(yuv2yuvX)(c,
  2902. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2903. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2904. dest, uDest, vDest, dstW, chrDstW);
  2905. }
  2906. }
  2907. else
  2908. {
  2909. ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
  2910. ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
  2911. if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
  2912. {
  2913. int chrAlpha= vChrFilter[2*dstY+1];
  2914. RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
  2915. dest, dstW, chrAlpha, dstFormat, flags, dstY);
  2916. }
  2917. else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
  2918. {
  2919. int lumAlpha= vLumFilter[2*dstY+1];
  2920. int chrAlpha= vChrFilter[2*dstY+1];
  2921. lumMmxFilter[2]=
  2922. lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
  2923. chrMmxFilter[2]=
  2924. chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
  2925. RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
  2926. dest, dstW, lumAlpha, chrAlpha, dstY);
  2927. }
  2928. else //General RGB
  2929. {
  2930. RENAME(yuv2packedX)(c,
  2931. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2932. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2933. dest, dstW, dstY);
  2934. }
  2935. }
  2936. }
  2937. else // hmm looks like we can't use MMX here without overwriting this array's tail
  2938. {
  2939. int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  2940. int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
  2941. if(dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
  2942. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
  2943. if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
  2944. yuv2nv12XinC(
  2945. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2946. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2947. dest, uDest, dstW, chrDstW, dstFormat);
  2948. }
  2949. else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
  2950. {
  2951. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
  2952. if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
  2953. yuv2yuvXinC(
  2954. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2955. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2956. dest, uDest, vDest, dstW, chrDstW);
  2957. }
  2958. else
  2959. {
  2960. ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
  2961. ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
  2962. yuv2packedXinC(c,
  2963. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2964. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2965. dest, dstW, dstY);
  2966. }
  2967. }
  2968. }
  2969. #ifdef HAVE_MMX
  2970. __asm __volatile(SFENCE:::"memory");
  2971. __asm __volatile(EMMS:::"memory");
  2972. #endif
  2973. /* store changed local vars back in the context */
  2974. c->dstY= dstY;
  2975. c->lumBufIndex= lumBufIndex;
  2976. c->chrBufIndex= chrBufIndex;
  2977. c->lastInLumBuf= lastInLumBuf;
  2978. c->lastInChrBuf= lastInChrBuf;
  2979. return dstY - lastDstY;
  2980. }