You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

3224 lines
139KB

  1. /*
  2. * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. *
  20. * The C code (not assembly, MMX, ...) of this file can be used
  21. * under the LGPL license.
  22. */
  23. #undef REAL_MOVNTQ
  24. #undef MOVNTQ
  25. #undef PAVGB
  26. #undef PREFETCH
  27. #undef PREFETCHW
  28. #undef EMMS
  29. #undef SFENCE
  30. #if HAVE_AMD3DNOW
  31. /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
  32. #define EMMS "femms"
  33. #else
  34. #define EMMS "emms"
  35. #endif
  36. #if HAVE_AMD3DNOW
  37. #define PREFETCH "prefetch"
  38. #define PREFETCHW "prefetchw"
  39. #elif HAVE_MMX2
  40. #define PREFETCH "prefetchnta"
  41. #define PREFETCHW "prefetcht0"
  42. #else
  43. #define PREFETCH " # nop"
  44. #define PREFETCHW " # nop"
  45. #endif
  46. #if HAVE_MMX2
  47. #define SFENCE "sfence"
  48. #else
  49. #define SFENCE " # nop"
  50. #endif
  51. #if HAVE_MMX2
  52. #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
  53. #elif HAVE_AMD3DNOW
  54. #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
  55. #endif
  56. #if HAVE_MMX2
  57. #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
  58. #else
  59. #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
  60. #endif
  61. #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
  62. #if HAVE_ALTIVEC
  63. #include "ppc/swscale_altivec_template.c"
  64. #endif
  65. #define YSCALEYUV2YV12X(x, offset, dest, width) \
  66. __asm__ volatile(\
  67. "xor %%"REG_a", %%"REG_a" \n\t"\
  68. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  69. "movq %%mm3, %%mm4 \n\t"\
  70. "lea " offset "(%0), %%"REG_d" \n\t"\
  71. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  72. ASMALIGN(4) /* FIXME Unroll? */\
  73. "1: \n\t"\
  74. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  75. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
  76. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
  77. "add $16, %%"REG_d" \n\t"\
  78. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  79. "test %%"REG_S", %%"REG_S" \n\t"\
  80. "pmulhw %%mm0, %%mm2 \n\t"\
  81. "pmulhw %%mm0, %%mm5 \n\t"\
  82. "paddw %%mm2, %%mm3 \n\t"\
  83. "paddw %%mm5, %%mm4 \n\t"\
  84. " jnz 1b \n\t"\
  85. "psraw $3, %%mm3 \n\t"\
  86. "psraw $3, %%mm4 \n\t"\
  87. "packuswb %%mm4, %%mm3 \n\t"\
  88. MOVNTQ(%%mm3, (%1, %%REGa))\
  89. "add $8, %%"REG_a" \n\t"\
  90. "cmp %2, %%"REG_a" \n\t"\
  91. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  92. "movq %%mm3, %%mm4 \n\t"\
  93. "lea " offset "(%0), %%"REG_d" \n\t"\
  94. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  95. "jb 1b \n\t"\
  96. :: "r" (&c->redDither),\
  97. "r" (dest), "g" (width)\
  98. : "%"REG_a, "%"REG_d, "%"REG_S\
  99. );
  100. #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
  101. __asm__ volatile(\
  102. "lea " offset "(%0), %%"REG_d" \n\t"\
  103. "xor %%"REG_a", %%"REG_a" \n\t"\
  104. "pxor %%mm4, %%mm4 \n\t"\
  105. "pxor %%mm5, %%mm5 \n\t"\
  106. "pxor %%mm6, %%mm6 \n\t"\
  107. "pxor %%mm7, %%mm7 \n\t"\
  108. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  109. ASMALIGN(4) \
  110. "1: \n\t"\
  111. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
  112. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
  113. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  114. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
  115. "movq %%mm0, %%mm3 \n\t"\
  116. "punpcklwd %%mm1, %%mm0 \n\t"\
  117. "punpckhwd %%mm1, %%mm3 \n\t"\
  118. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
  119. "pmaddwd %%mm1, %%mm0 \n\t"\
  120. "pmaddwd %%mm1, %%mm3 \n\t"\
  121. "paddd %%mm0, %%mm4 \n\t"\
  122. "paddd %%mm3, %%mm5 \n\t"\
  123. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
  124. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  125. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  126. "test %%"REG_S", %%"REG_S" \n\t"\
  127. "movq %%mm2, %%mm0 \n\t"\
  128. "punpcklwd %%mm3, %%mm2 \n\t"\
  129. "punpckhwd %%mm3, %%mm0 \n\t"\
  130. "pmaddwd %%mm1, %%mm2 \n\t"\
  131. "pmaddwd %%mm1, %%mm0 \n\t"\
  132. "paddd %%mm2, %%mm6 \n\t"\
  133. "paddd %%mm0, %%mm7 \n\t"\
  134. " jnz 1b \n\t"\
  135. "psrad $16, %%mm4 \n\t"\
  136. "psrad $16, %%mm5 \n\t"\
  137. "psrad $16, %%mm6 \n\t"\
  138. "psrad $16, %%mm7 \n\t"\
  139. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  140. "packssdw %%mm5, %%mm4 \n\t"\
  141. "packssdw %%mm7, %%mm6 \n\t"\
  142. "paddw %%mm0, %%mm4 \n\t"\
  143. "paddw %%mm0, %%mm6 \n\t"\
  144. "psraw $3, %%mm4 \n\t"\
  145. "psraw $3, %%mm6 \n\t"\
  146. "packuswb %%mm6, %%mm4 \n\t"\
  147. MOVNTQ(%%mm4, (%1, %%REGa))\
  148. "add $8, %%"REG_a" \n\t"\
  149. "cmp %2, %%"REG_a" \n\t"\
  150. "lea " offset "(%0), %%"REG_d" \n\t"\
  151. "pxor %%mm4, %%mm4 \n\t"\
  152. "pxor %%mm5, %%mm5 \n\t"\
  153. "pxor %%mm6, %%mm6 \n\t"\
  154. "pxor %%mm7, %%mm7 \n\t"\
  155. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  156. "jb 1b \n\t"\
  157. :: "r" (&c->redDither),\
  158. "r" (dest), "g" (width)\
  159. : "%"REG_a, "%"REG_d, "%"REG_S\
  160. );
  161. #define YSCALEYUV2YV121 \
  162. "mov %2, %%"REG_a" \n\t"\
  163. ASMALIGN(4) /* FIXME Unroll? */\
  164. "1: \n\t"\
  165. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
  166. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
  167. "psraw $7, %%mm0 \n\t"\
  168. "psraw $7, %%mm1 \n\t"\
  169. "packuswb %%mm1, %%mm0 \n\t"\
  170. MOVNTQ(%%mm0, (%1, %%REGa))\
  171. "add $8, %%"REG_a" \n\t"\
  172. "jnc 1b \n\t"
  173. #define YSCALEYUV2YV121_ACCURATE \
  174. "mov %2, %%"REG_a" \n\t"\
  175. "pcmpeqw %%mm7, %%mm7 \n\t"\
  176. "psrlw $15, %%mm7 \n\t"\
  177. "psllw $6, %%mm7 \n\t"\
  178. ASMALIGN(4) /* FIXME Unroll? */\
  179. "1: \n\t"\
  180. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
  181. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
  182. "paddsw %%mm7, %%mm0 \n\t"\
  183. "paddsw %%mm7, %%mm1 \n\t"\
  184. "psraw $7, %%mm0 \n\t"\
  185. "psraw $7, %%mm1 \n\t"\
  186. "packuswb %%mm1, %%mm0 \n\t"\
  187. MOVNTQ(%%mm0, (%1, %%REGa))\
  188. "add $8, %%"REG_a" \n\t"\
  189. "jnc 1b \n\t"
  190. /*
  191. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  192. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  193. "r" (dest), "m" (dstW),
  194. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  195. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  196. */
  197. #define YSCALEYUV2PACKEDX_UV \
  198. __asm__ volatile(\
  199. "xor %%"REG_a", %%"REG_a" \n\t"\
  200. ASMALIGN(4)\
  201. "nop \n\t"\
  202. "1: \n\t"\
  203. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  204. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  205. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  206. "movq %%mm3, %%mm4 \n\t"\
  207. ASMALIGN(4)\
  208. "2: \n\t"\
  209. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  210. "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
  211. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
  212. "add $16, %%"REG_d" \n\t"\
  213. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  214. "pmulhw %%mm0, %%mm2 \n\t"\
  215. "pmulhw %%mm0, %%mm5 \n\t"\
  216. "paddw %%mm2, %%mm3 \n\t"\
  217. "paddw %%mm5, %%mm4 \n\t"\
  218. "test %%"REG_S", %%"REG_S" \n\t"\
  219. " jnz 2b \n\t"\
  220. #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
  221. "lea "offset"(%0), %%"REG_d" \n\t"\
  222. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  223. "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
  224. "movq "#dst1", "#dst2" \n\t"\
  225. ASMALIGN(4)\
  226. "2: \n\t"\
  227. "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
  228. "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
  229. "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
  230. "add $16, %%"REG_d" \n\t"\
  231. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  232. "pmulhw "#coeff", "#src1" \n\t"\
  233. "pmulhw "#coeff", "#src2" \n\t"\
  234. "paddw "#src1", "#dst1" \n\t"\
  235. "paddw "#src2", "#dst2" \n\t"\
  236. "test %%"REG_S", %%"REG_S" \n\t"\
  237. " jnz 2b \n\t"\
  238. #define YSCALEYUV2PACKEDX \
  239. YSCALEYUV2PACKEDX_UV \
  240. YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
  241. #define YSCALEYUV2PACKEDX_END \
  242. :: "r" (&c->redDither), \
  243. "m" (dummy), "m" (dummy), "m" (dummy),\
  244. "r" (dest), "m" (dstW) \
  245. : "%"REG_a, "%"REG_d, "%"REG_S \
  246. );
  247. #define YSCALEYUV2PACKEDX_ACCURATE_UV \
  248. __asm__ volatile(\
  249. "xor %%"REG_a", %%"REG_a" \n\t"\
  250. ASMALIGN(4)\
  251. "nop \n\t"\
  252. "1: \n\t"\
  253. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  254. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  255. "pxor %%mm4, %%mm4 \n\t"\
  256. "pxor %%mm5, %%mm5 \n\t"\
  257. "pxor %%mm6, %%mm6 \n\t"\
  258. "pxor %%mm7, %%mm7 \n\t"\
  259. ASMALIGN(4)\
  260. "2: \n\t"\
  261. "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
  262. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
  263. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  264. "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
  265. "movq %%mm0, %%mm3 \n\t"\
  266. "punpcklwd %%mm1, %%mm0 \n\t"\
  267. "punpckhwd %%mm1, %%mm3 \n\t"\
  268. "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
  269. "pmaddwd %%mm1, %%mm0 \n\t"\
  270. "pmaddwd %%mm1, %%mm3 \n\t"\
  271. "paddd %%mm0, %%mm4 \n\t"\
  272. "paddd %%mm3, %%mm5 \n\t"\
  273. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
  274. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  275. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  276. "test %%"REG_S", %%"REG_S" \n\t"\
  277. "movq %%mm2, %%mm0 \n\t"\
  278. "punpcklwd %%mm3, %%mm2 \n\t"\
  279. "punpckhwd %%mm3, %%mm0 \n\t"\
  280. "pmaddwd %%mm1, %%mm2 \n\t"\
  281. "pmaddwd %%mm1, %%mm0 \n\t"\
  282. "paddd %%mm2, %%mm6 \n\t"\
  283. "paddd %%mm0, %%mm7 \n\t"\
  284. " jnz 2b \n\t"\
  285. "psrad $16, %%mm4 \n\t"\
  286. "psrad $16, %%mm5 \n\t"\
  287. "psrad $16, %%mm6 \n\t"\
  288. "psrad $16, %%mm7 \n\t"\
  289. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  290. "packssdw %%mm5, %%mm4 \n\t"\
  291. "packssdw %%mm7, %%mm6 \n\t"\
  292. "paddw %%mm0, %%mm4 \n\t"\
  293. "paddw %%mm0, %%mm6 \n\t"\
  294. "movq %%mm4, "U_TEMP"(%0) \n\t"\
  295. "movq %%mm6, "V_TEMP"(%0) \n\t"\
  296. #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
  297. "lea "offset"(%0), %%"REG_d" \n\t"\
  298. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  299. "pxor %%mm1, %%mm1 \n\t"\
  300. "pxor %%mm5, %%mm5 \n\t"\
  301. "pxor %%mm7, %%mm7 \n\t"\
  302. "pxor %%mm6, %%mm6 \n\t"\
  303. ASMALIGN(4)\
  304. "2: \n\t"\
  305. "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
  306. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
  307. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  308. "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
  309. "movq %%mm0, %%mm3 \n\t"\
  310. "punpcklwd %%mm4, %%mm0 \n\t"\
  311. "punpckhwd %%mm4, %%mm3 \n\t"\
  312. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
  313. "pmaddwd %%mm4, %%mm0 \n\t"\
  314. "pmaddwd %%mm4, %%mm3 \n\t"\
  315. "paddd %%mm0, %%mm1 \n\t"\
  316. "paddd %%mm3, %%mm5 \n\t"\
  317. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
  318. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  319. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  320. "test %%"REG_S", %%"REG_S" \n\t"\
  321. "movq %%mm2, %%mm0 \n\t"\
  322. "punpcklwd %%mm3, %%mm2 \n\t"\
  323. "punpckhwd %%mm3, %%mm0 \n\t"\
  324. "pmaddwd %%mm4, %%mm2 \n\t"\
  325. "pmaddwd %%mm4, %%mm0 \n\t"\
  326. "paddd %%mm2, %%mm7 \n\t"\
  327. "paddd %%mm0, %%mm6 \n\t"\
  328. " jnz 2b \n\t"\
  329. "psrad $16, %%mm1 \n\t"\
  330. "psrad $16, %%mm5 \n\t"\
  331. "psrad $16, %%mm7 \n\t"\
  332. "psrad $16, %%mm6 \n\t"\
  333. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  334. "packssdw %%mm5, %%mm1 \n\t"\
  335. "packssdw %%mm6, %%mm7 \n\t"\
  336. "paddw %%mm0, %%mm1 \n\t"\
  337. "paddw %%mm0, %%mm7 \n\t"\
  338. "movq "U_TEMP"(%0), %%mm3 \n\t"\
  339. "movq "V_TEMP"(%0), %%mm4 \n\t"\
  340. #define YSCALEYUV2PACKEDX_ACCURATE \
  341. YSCALEYUV2PACKEDX_ACCURATE_UV \
  342. YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
  343. #define YSCALEYUV2RGBX \
  344. "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
  345. "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
  346. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  347. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  348. "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
  349. "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
  350. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  351. "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
  352. "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
  353. "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
  354. "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
  355. "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
  356. "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
  357. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  358. "paddw %%mm3, %%mm4 \n\t"\
  359. "movq %%mm2, %%mm0 \n\t"\
  360. "movq %%mm5, %%mm6 \n\t"\
  361. "movq %%mm4, %%mm3 \n\t"\
  362. "punpcklwd %%mm2, %%mm2 \n\t"\
  363. "punpcklwd %%mm5, %%mm5 \n\t"\
  364. "punpcklwd %%mm4, %%mm4 \n\t"\
  365. "paddw %%mm1, %%mm2 \n\t"\
  366. "paddw %%mm1, %%mm5 \n\t"\
  367. "paddw %%mm1, %%mm4 \n\t"\
  368. "punpckhwd %%mm0, %%mm0 \n\t"\
  369. "punpckhwd %%mm6, %%mm6 \n\t"\
  370. "punpckhwd %%mm3, %%mm3 \n\t"\
  371. "paddw %%mm7, %%mm0 \n\t"\
  372. "paddw %%mm7, %%mm6 \n\t"\
  373. "paddw %%mm7, %%mm3 \n\t"\
  374. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  375. "packuswb %%mm0, %%mm2 \n\t"\
  376. "packuswb %%mm6, %%mm5 \n\t"\
  377. "packuswb %%mm3, %%mm4 \n\t"\
  378. #define REAL_YSCALEYUV2PACKED(index, c) \
  379. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  380. "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
  381. "psraw $3, %%mm0 \n\t"\
  382. "psraw $3, %%mm1 \n\t"\
  383. "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  384. "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  385. "xor "#index", "#index" \n\t"\
  386. ASMALIGN(4)\
  387. "1: \n\t"\
  388. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  389. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  390. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  391. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  392. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  393. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  394. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  395. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  396. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  397. "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  398. "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  399. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  400. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  401. "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  402. "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  403. "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  404. "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  405. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  406. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  407. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  408. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  409. "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  410. "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  411. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  412. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  413. #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
  414. #define REAL_YSCALEYUV2RGB_UV(index, c) \
  415. "xor "#index", "#index" \n\t"\
  416. ASMALIGN(4)\
  417. "1: \n\t"\
  418. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  419. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  420. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  421. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  422. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  423. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  424. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  425. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  426. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  427. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  428. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  429. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  430. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  431. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  432. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  433. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  434. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  435. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  436. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  437. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  438. #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
  439. "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  440. "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  441. "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  442. "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  443. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  444. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  445. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  446. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  447. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  448. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  449. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  450. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  451. #define REAL_YSCALEYUV2RGB_COEFF(c) \
  452. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  453. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  454. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  455. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  456. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  457. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  458. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  459. "paddw %%mm3, %%mm4 \n\t"\
  460. "movq %%mm2, %%mm0 \n\t"\
  461. "movq %%mm5, %%mm6 \n\t"\
  462. "movq %%mm4, %%mm3 \n\t"\
  463. "punpcklwd %%mm2, %%mm2 \n\t"\
  464. "punpcklwd %%mm5, %%mm5 \n\t"\
  465. "punpcklwd %%mm4, %%mm4 \n\t"\
  466. "paddw %%mm1, %%mm2 \n\t"\
  467. "paddw %%mm1, %%mm5 \n\t"\
  468. "paddw %%mm1, %%mm4 \n\t"\
  469. "punpckhwd %%mm0, %%mm0 \n\t"\
  470. "punpckhwd %%mm6, %%mm6 \n\t"\
  471. "punpckhwd %%mm3, %%mm3 \n\t"\
  472. "paddw %%mm7, %%mm0 \n\t"\
  473. "paddw %%mm7, %%mm6 \n\t"\
  474. "paddw %%mm7, %%mm3 \n\t"\
  475. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  476. "packuswb %%mm0, %%mm2 \n\t"\
  477. "packuswb %%mm6, %%mm5 \n\t"\
  478. "packuswb %%mm3, %%mm4 \n\t"\
  479. #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
  480. #define YSCALEYUV2RGB(index, c) \
  481. REAL_YSCALEYUV2RGB_UV(index, c) \
  482. REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
  483. REAL_YSCALEYUV2RGB_COEFF(c)
  484. #define REAL_YSCALEYUV2PACKED1(index, c) \
  485. "xor "#index", "#index" \n\t"\
  486. ASMALIGN(4)\
  487. "1: \n\t"\
  488. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  489. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  490. "psraw $7, %%mm3 \n\t" \
  491. "psraw $7, %%mm4 \n\t" \
  492. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  493. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  494. "psraw $7, %%mm1 \n\t" \
  495. "psraw $7, %%mm7 \n\t" \
  496. #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
  497. #define REAL_YSCALEYUV2RGB1(index, c) \
  498. "xor "#index", "#index" \n\t"\
  499. ASMALIGN(4)\
  500. "1: \n\t"\
  501. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  502. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  503. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  504. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  505. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  506. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  507. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  508. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  509. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  510. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  511. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  512. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  513. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  514. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  515. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  516. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  517. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  518. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  519. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  520. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  521. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  522. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  523. "paddw %%mm3, %%mm4 \n\t"\
  524. "movq %%mm2, %%mm0 \n\t"\
  525. "movq %%mm5, %%mm6 \n\t"\
  526. "movq %%mm4, %%mm3 \n\t"\
  527. "punpcklwd %%mm2, %%mm2 \n\t"\
  528. "punpcklwd %%mm5, %%mm5 \n\t"\
  529. "punpcklwd %%mm4, %%mm4 \n\t"\
  530. "paddw %%mm1, %%mm2 \n\t"\
  531. "paddw %%mm1, %%mm5 \n\t"\
  532. "paddw %%mm1, %%mm4 \n\t"\
  533. "punpckhwd %%mm0, %%mm0 \n\t"\
  534. "punpckhwd %%mm6, %%mm6 \n\t"\
  535. "punpckhwd %%mm3, %%mm3 \n\t"\
  536. "paddw %%mm7, %%mm0 \n\t"\
  537. "paddw %%mm7, %%mm6 \n\t"\
  538. "paddw %%mm7, %%mm3 \n\t"\
  539. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  540. "packuswb %%mm0, %%mm2 \n\t"\
  541. "packuswb %%mm6, %%mm5 \n\t"\
  542. "packuswb %%mm3, %%mm4 \n\t"\
  543. #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
  544. #define REAL_YSCALEYUV2PACKED1b(index, c) \
  545. "xor "#index", "#index" \n\t"\
  546. ASMALIGN(4)\
  547. "1: \n\t"\
  548. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  549. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  550. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  551. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  552. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  553. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  554. "psrlw $8, %%mm3 \n\t" \
  555. "psrlw $8, %%mm4 \n\t" \
  556. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  557. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  558. "psraw $7, %%mm1 \n\t" \
  559. "psraw $7, %%mm7 \n\t"
  560. #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
  561. // do vertical chrominance interpolation
  562. #define REAL_YSCALEYUV2RGB1b(index, c) \
  563. "xor "#index", "#index" \n\t"\
  564. ASMALIGN(4)\
  565. "1: \n\t"\
  566. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  567. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  568. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  569. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  570. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  571. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  572. "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
  573. "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
  574. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  575. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  576. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  577. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  578. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  579. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  580. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  581. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  582. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  583. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  584. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  585. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  586. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  587. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  588. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  589. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  590. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  591. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  592. "paddw %%mm3, %%mm4 \n\t"\
  593. "movq %%mm2, %%mm0 \n\t"\
  594. "movq %%mm5, %%mm6 \n\t"\
  595. "movq %%mm4, %%mm3 \n\t"\
  596. "punpcklwd %%mm2, %%mm2 \n\t"\
  597. "punpcklwd %%mm5, %%mm5 \n\t"\
  598. "punpcklwd %%mm4, %%mm4 \n\t"\
  599. "paddw %%mm1, %%mm2 \n\t"\
  600. "paddw %%mm1, %%mm5 \n\t"\
  601. "paddw %%mm1, %%mm4 \n\t"\
  602. "punpckhwd %%mm0, %%mm0 \n\t"\
  603. "punpckhwd %%mm6, %%mm6 \n\t"\
  604. "punpckhwd %%mm3, %%mm3 \n\t"\
  605. "paddw %%mm7, %%mm0 \n\t"\
  606. "paddw %%mm7, %%mm6 \n\t"\
  607. "paddw %%mm7, %%mm3 \n\t"\
  608. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  609. "packuswb %%mm0, %%mm2 \n\t"\
  610. "packuswb %%mm6, %%mm5 \n\t"\
  611. "packuswb %%mm3, %%mm4 \n\t"\
  612. #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
  613. #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
  614. "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
  615. "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
  616. "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
  617. "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
  618. "packuswb %%mm1, %%mm7 \n\t"
  619. #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
  620. #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
  621. "movq "#b", "#q2" \n\t" /* B */\
  622. "movq "#r", "#t" \n\t" /* R */\
  623. "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
  624. "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
  625. "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
  626. "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
  627. "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
  628. "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
  629. "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
  630. "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
  631. "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
  632. "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
  633. \
  634. MOVNTQ( q0, (dst, index, 4))\
  635. MOVNTQ( b, 8(dst, index, 4))\
  636. MOVNTQ( q2, 16(dst, index, 4))\
  637. MOVNTQ( q3, 24(dst, index, 4))\
  638. \
  639. "add $8, "#index" \n\t"\
  640. "cmp "#dstw", "#index" \n\t"\
  641. " jb 1b \n\t"
  642. #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
  643. #define REAL_WRITERGB16(dst, dstw, index) \
  644. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  645. "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
  646. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  647. "psrlq $3, %%mm2 \n\t"\
  648. \
  649. "movq %%mm2, %%mm1 \n\t"\
  650. "movq %%mm4, %%mm3 \n\t"\
  651. \
  652. "punpcklbw %%mm7, %%mm3 \n\t"\
  653. "punpcklbw %%mm5, %%mm2 \n\t"\
  654. "punpckhbw %%mm7, %%mm4 \n\t"\
  655. "punpckhbw %%mm5, %%mm1 \n\t"\
  656. \
  657. "psllq $3, %%mm3 \n\t"\
  658. "psllq $3, %%mm4 \n\t"\
  659. \
  660. "por %%mm3, %%mm2 \n\t"\
  661. "por %%mm4, %%mm1 \n\t"\
  662. \
  663. MOVNTQ(%%mm2, (dst, index, 2))\
  664. MOVNTQ(%%mm1, 8(dst, index, 2))\
  665. \
  666. "add $8, "#index" \n\t"\
  667. "cmp "#dstw", "#index" \n\t"\
  668. " jb 1b \n\t"
  669. #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
  670. #define REAL_WRITERGB15(dst, dstw, index) \
  671. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  672. "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
  673. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  674. "psrlq $3, %%mm2 \n\t"\
  675. "psrlq $1, %%mm5 \n\t"\
  676. \
  677. "movq %%mm2, %%mm1 \n\t"\
  678. "movq %%mm4, %%mm3 \n\t"\
  679. \
  680. "punpcklbw %%mm7, %%mm3 \n\t"\
  681. "punpcklbw %%mm5, %%mm2 \n\t"\
  682. "punpckhbw %%mm7, %%mm4 \n\t"\
  683. "punpckhbw %%mm5, %%mm1 \n\t"\
  684. \
  685. "psllq $2, %%mm3 \n\t"\
  686. "psllq $2, %%mm4 \n\t"\
  687. \
  688. "por %%mm3, %%mm2 \n\t"\
  689. "por %%mm4, %%mm1 \n\t"\
  690. \
  691. MOVNTQ(%%mm2, (dst, index, 2))\
  692. MOVNTQ(%%mm1, 8(dst, index, 2))\
  693. \
  694. "add $8, "#index" \n\t"\
  695. "cmp "#dstw", "#index" \n\t"\
  696. " jb 1b \n\t"
  697. #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
  698. #define WRITEBGR24OLD(dst, dstw, index) \
  699. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  700. "movq %%mm2, %%mm1 \n\t" /* B */\
  701. "movq %%mm5, %%mm6 \n\t" /* R */\
  702. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  703. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  704. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  705. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  706. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  707. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  708. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  709. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  710. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  711. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  712. \
  713. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  714. "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
  715. "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
  716. "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
  717. "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
  718. "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
  719. "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
  720. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  721. \
  722. "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  723. "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
  724. "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
  725. "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
  726. "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
  727. "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
  728. "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
  729. "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
  730. "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
  731. "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
  732. "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
  733. "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
  734. "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
  735. \
  736. "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
  737. "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
  738. "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
  739. "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
  740. "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
  741. "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
  742. "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
  743. "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
  744. \
  745. MOVNTQ(%%mm0, (dst))\
  746. MOVNTQ(%%mm2, 8(dst))\
  747. MOVNTQ(%%mm3, 16(dst))\
  748. "add $24, "#dst" \n\t"\
  749. \
  750. "add $8, "#index" \n\t"\
  751. "cmp "#dstw", "#index" \n\t"\
  752. " jb 1b \n\t"
  753. #define WRITEBGR24MMX(dst, dstw, index) \
  754. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  755. "movq %%mm2, %%mm1 \n\t" /* B */\
  756. "movq %%mm5, %%mm6 \n\t" /* R */\
  757. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  758. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  759. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  760. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  761. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  762. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  763. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  764. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  765. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  766. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  767. \
  768. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  769. "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
  770. "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
  771. "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
  772. \
  773. "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
  774. "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
  775. "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
  776. "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
  777. \
  778. "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
  779. "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
  780. "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
  781. "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
  782. \
  783. "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
  784. "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
  785. "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
  786. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  787. MOVNTQ(%%mm0, (dst))\
  788. \
  789. "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
  790. "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
  791. "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
  792. "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
  793. MOVNTQ(%%mm6, 8(dst))\
  794. \
  795. "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
  796. "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
  797. "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
  798. MOVNTQ(%%mm5, 16(dst))\
  799. \
  800. "add $24, "#dst" \n\t"\
  801. \
  802. "add $8, "#index" \n\t"\
  803. "cmp "#dstw", "#index" \n\t"\
  804. " jb 1b \n\t"
  805. #define WRITEBGR24MMX2(dst, dstw, index) \
  806. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  807. "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
  808. "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
  809. "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
  810. "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
  811. "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
  812. \
  813. "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
  814. "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
  815. "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
  816. \
  817. "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
  818. "por %%mm1, %%mm6 \n\t"\
  819. "por %%mm3, %%mm6 \n\t"\
  820. MOVNTQ(%%mm6, (dst))\
  821. \
  822. "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
  823. "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
  824. "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
  825. "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
  826. \
  827. "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
  828. "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
  829. "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
  830. \
  831. "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
  832. "por %%mm3, %%mm6 \n\t"\
  833. MOVNTQ(%%mm6, 8(dst))\
  834. \
  835. "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
  836. "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
  837. "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
  838. \
  839. "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
  840. "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
  841. "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
  842. \
  843. "por %%mm1, %%mm3 \n\t"\
  844. "por %%mm3, %%mm6 \n\t"\
  845. MOVNTQ(%%mm6, 16(dst))\
  846. \
  847. "add $24, "#dst" \n\t"\
  848. \
  849. "add $8, "#index" \n\t"\
  850. "cmp "#dstw", "#index" \n\t"\
  851. " jb 1b \n\t"
  852. #if HAVE_MMX2
  853. #undef WRITEBGR24
  854. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
  855. #else
  856. #undef WRITEBGR24
  857. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
  858. #endif
  859. #define REAL_WRITEYUY2(dst, dstw, index) \
  860. "packuswb %%mm3, %%mm3 \n\t"\
  861. "packuswb %%mm4, %%mm4 \n\t"\
  862. "packuswb %%mm7, %%mm1 \n\t"\
  863. "punpcklbw %%mm4, %%mm3 \n\t"\
  864. "movq %%mm1, %%mm7 \n\t"\
  865. "punpcklbw %%mm3, %%mm1 \n\t"\
  866. "punpckhbw %%mm3, %%mm7 \n\t"\
  867. \
  868. MOVNTQ(%%mm1, (dst, index, 2))\
  869. MOVNTQ(%%mm7, 8(dst, index, 2))\
  870. \
  871. "add $8, "#index" \n\t"\
  872. "cmp "#dstw", "#index" \n\t"\
  873. " jb 1b \n\t"
  874. #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
  875. static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
  876. int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, int16_t **alpSrc,
  877. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
  878. {
  879. #if HAVE_MMX
  880. if(!(c->flags & SWS_BITEXACT)){
  881. if (c->flags & SWS_ACCURATE_RND){
  882. if (uDest){
  883. YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
  884. YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
  885. }
  886. if (CONFIG_SWSCALE_ALPHA && aDest){
  887. YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
  888. }
  889. YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
  890. }else{
  891. if (uDest){
  892. YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
  893. YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
  894. }
  895. if (CONFIG_SWSCALE_ALPHA && aDest){
  896. YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
  897. }
  898. YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
  899. }
  900. return;
  901. }
  902. #endif
  903. #if HAVE_ALTIVEC
  904. yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
  905. chrFilter, chrSrc, chrFilterSize,
  906. dest, uDest, vDest, dstW, chrDstW);
  907. #else //HAVE_ALTIVEC
  908. yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
  909. chrFilter, chrSrc, chrFilterSize,
  910. alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
  911. #endif //!HAVE_ALTIVEC
  912. }
  913. static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
  914. int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
  915. uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
  916. {
  917. yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
  918. chrFilter, chrSrc, chrFilterSize,
  919. dest, uDest, dstW, chrDstW, dstFormat);
  920. }
  921. static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc, int16_t *alpSrc,
  922. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
  923. {
  924. int i;
  925. #if HAVE_MMX
  926. if(!(c->flags & SWS_BITEXACT)){
  927. long p= 4;
  928. uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
  929. uint8_t *dst[4]= {aDest, dest, uDest, vDest};
  930. x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
  931. if (c->flags & SWS_ACCURATE_RND){
  932. while(p--){
  933. if (dst[p]){
  934. __asm__ volatile(
  935. YSCALEYUV2YV121_ACCURATE
  936. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  937. "g" (-counter[p])
  938. : "%"REG_a
  939. );
  940. }
  941. }
  942. }else{
  943. while(p--){
  944. if (dst[p]){
  945. __asm__ volatile(
  946. YSCALEYUV2YV121
  947. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  948. "g" (-counter[p])
  949. : "%"REG_a
  950. );
  951. }
  952. }
  953. }
  954. return;
  955. }
  956. #endif
  957. for (i=0; i<dstW; i++)
  958. {
  959. int val= (lumSrc[i]+64)>>7;
  960. if (val&256){
  961. if (val<0) val=0;
  962. else val=255;
  963. }
  964. dest[i]= val;
  965. }
  966. if (uDest)
  967. for (i=0; i<chrDstW; i++)
  968. {
  969. int u=(chrSrc[i ]+64)>>7;
  970. int v=(chrSrc[i + VOFW]+64)>>7;
  971. if ((u|v)&256){
  972. if (u<0) u=0;
  973. else if (u>255) u=255;
  974. if (v<0) v=0;
  975. else if (v>255) v=255;
  976. }
  977. uDest[i]= u;
  978. vDest[i]= v;
  979. }
  980. if (CONFIG_SWSCALE_ALPHA && aDest)
  981. for (i=0; i<dstW; i++){
  982. int val= (alpSrc[i]+64)>>7;
  983. aDest[i]= av_clip_uint8(val);
  984. }
  985. }
  986. /**
  987. * vertical scale YV12 to RGB
  988. */
  989. static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
  990. int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
  991. int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
  992. {
  993. #if HAVE_MMX
  994. x86_reg dummy=0;
  995. if(!(c->flags & SWS_BITEXACT)){
  996. if (c->flags & SWS_ACCURATE_RND){
  997. switch(c->dstFormat){
  998. case PIX_FMT_RGB32:
  999. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
  1000. YSCALEYUV2PACKEDX_ACCURATE
  1001. YSCALEYUV2RGBX
  1002. "movq %%mm2, "U_TEMP"(%0) \n\t"
  1003. "movq %%mm4, "V_TEMP"(%0) \n\t"
  1004. "movq %%mm5, "Y_TEMP"(%0) \n\t"
  1005. YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
  1006. "movq "Y_TEMP"(%0), %%mm5 \n\t"
  1007. "psraw $3, %%mm1 \n\t"
  1008. "psraw $3, %%mm7 \n\t"
  1009. "packuswb %%mm7, %%mm1 \n\t"
  1010. WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
  1011. YSCALEYUV2PACKEDX_END
  1012. }else{
  1013. YSCALEYUV2PACKEDX_ACCURATE
  1014. YSCALEYUV2RGBX
  1015. "pcmpeqd %%mm7, %%mm7 \n\t"
  1016. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1017. YSCALEYUV2PACKEDX_END
  1018. }
  1019. return;
  1020. case PIX_FMT_BGR24:
  1021. YSCALEYUV2PACKEDX_ACCURATE
  1022. YSCALEYUV2RGBX
  1023. "pxor %%mm7, %%mm7 \n\t"
  1024. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
  1025. "add %4, %%"REG_c" \n\t"
  1026. WRITEBGR24(%%REGc, %5, %%REGa)
  1027. :: "r" (&c->redDither),
  1028. "m" (dummy), "m" (dummy), "m" (dummy),
  1029. "r" (dest), "m" (dstW)
  1030. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  1031. );
  1032. return;
  1033. case PIX_FMT_RGB555:
  1034. YSCALEYUV2PACKEDX_ACCURATE
  1035. YSCALEYUV2RGBX
  1036. "pxor %%mm7, %%mm7 \n\t"
  1037. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1038. #ifdef DITHER1XBPP
  1039. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  1040. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  1041. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  1042. #endif
  1043. WRITERGB15(%4, %5, %%REGa)
  1044. YSCALEYUV2PACKEDX_END
  1045. return;
  1046. case PIX_FMT_RGB565:
  1047. YSCALEYUV2PACKEDX_ACCURATE
  1048. YSCALEYUV2RGBX
  1049. "pxor %%mm7, %%mm7 \n\t"
  1050. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1051. #ifdef DITHER1XBPP
  1052. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  1053. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  1054. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  1055. #endif
  1056. WRITERGB16(%4, %5, %%REGa)
  1057. YSCALEYUV2PACKEDX_END
  1058. return;
  1059. case PIX_FMT_YUYV422:
  1060. YSCALEYUV2PACKEDX_ACCURATE
  1061. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1062. "psraw $3, %%mm3 \n\t"
  1063. "psraw $3, %%mm4 \n\t"
  1064. "psraw $3, %%mm1 \n\t"
  1065. "psraw $3, %%mm7 \n\t"
  1066. WRITEYUY2(%4, %5, %%REGa)
  1067. YSCALEYUV2PACKEDX_END
  1068. return;
  1069. }
  1070. }else{
  1071. switch(c->dstFormat)
  1072. {
  1073. case PIX_FMT_RGB32:
  1074. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
  1075. YSCALEYUV2PACKEDX
  1076. YSCALEYUV2RGBX
  1077. YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
  1078. "psraw $3, %%mm1 \n\t"
  1079. "psraw $3, %%mm7 \n\t"
  1080. "packuswb %%mm7, %%mm1 \n\t"
  1081. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1082. YSCALEYUV2PACKEDX_END
  1083. }else{
  1084. YSCALEYUV2PACKEDX
  1085. YSCALEYUV2RGBX
  1086. "pcmpeqd %%mm7, %%mm7 \n\t"
  1087. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1088. YSCALEYUV2PACKEDX_END
  1089. }
  1090. return;
  1091. case PIX_FMT_BGR24:
  1092. YSCALEYUV2PACKEDX
  1093. YSCALEYUV2RGBX
  1094. "pxor %%mm7, %%mm7 \n\t"
  1095. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
  1096. "add %4, %%"REG_c" \n\t"
  1097. WRITEBGR24(%%REGc, %5, %%REGa)
  1098. :: "r" (&c->redDither),
  1099. "m" (dummy), "m" (dummy), "m" (dummy),
  1100. "r" (dest), "m" (dstW)
  1101. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  1102. );
  1103. return;
  1104. case PIX_FMT_RGB555:
  1105. YSCALEYUV2PACKEDX
  1106. YSCALEYUV2RGBX
  1107. "pxor %%mm7, %%mm7 \n\t"
  1108. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1109. #ifdef DITHER1XBPP
  1110. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  1111. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  1112. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  1113. #endif
  1114. WRITERGB15(%4, %5, %%REGa)
  1115. YSCALEYUV2PACKEDX_END
  1116. return;
  1117. case PIX_FMT_RGB565:
  1118. YSCALEYUV2PACKEDX
  1119. YSCALEYUV2RGBX
  1120. "pxor %%mm7, %%mm7 \n\t"
  1121. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1122. #ifdef DITHER1XBPP
  1123. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  1124. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  1125. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  1126. #endif
  1127. WRITERGB16(%4, %5, %%REGa)
  1128. YSCALEYUV2PACKEDX_END
  1129. return;
  1130. case PIX_FMT_YUYV422:
  1131. YSCALEYUV2PACKEDX
  1132. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1133. "psraw $3, %%mm3 \n\t"
  1134. "psraw $3, %%mm4 \n\t"
  1135. "psraw $3, %%mm1 \n\t"
  1136. "psraw $3, %%mm7 \n\t"
  1137. WRITEYUY2(%4, %5, %%REGa)
  1138. YSCALEYUV2PACKEDX_END
  1139. return;
  1140. }
  1141. }
  1142. }
  1143. #endif /* HAVE_MMX */
  1144. #if HAVE_ALTIVEC
  1145. /* The following list of supported dstFormat values should
  1146. match what's found in the body of ff_yuv2packedX_altivec() */
  1147. if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
  1148. (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
  1149. c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
  1150. c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
  1151. ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
  1152. chrFilter, chrSrc, chrFilterSize,
  1153. dest, dstW, dstY);
  1154. else
  1155. #endif
  1156. yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
  1157. chrFilter, chrSrc, chrFilterSize,
  1158. alpSrc, dest, dstW, dstY);
  1159. }
  1160. /**
  1161. * vertical bilinear scale YV12 to RGB
  1162. */
  1163. static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
  1164. uint16_t *abuf0, uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
  1165. {
  1166. int yalpha1=4095- yalpha;
  1167. int uvalpha1=4095-uvalpha;
  1168. int i;
  1169. #if HAVE_MMX
  1170. if(!(c->flags & SWS_BITEXACT)){
  1171. switch(c->dstFormat)
  1172. {
  1173. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  1174. case PIX_FMT_RGB32:
  1175. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
  1176. #if ARCH_X86_64
  1177. __asm__ volatile(
  1178. YSCALEYUV2RGB(%%REGBP, %5)
  1179. YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
  1180. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1181. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1182. "packuswb %%mm7, %%mm1 \n\t"
  1183. WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1184. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
  1185. "a" (&c->redDither)
  1186. ,"r" (abuf0), "r" (abuf1)
  1187. : "%"REG_BP
  1188. );
  1189. #else
  1190. *(uint16_t **)(&c->u_temp)=abuf0;
  1191. *(uint16_t **)(&c->v_temp)=abuf1;
  1192. __asm__ volatile(
  1193. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1194. "mov %4, %%"REG_b" \n\t"
  1195. "push %%"REG_BP" \n\t"
  1196. YSCALEYUV2RGB(%%REGBP, %5)
  1197. "push %0 \n\t"
  1198. "push %1 \n\t"
  1199. "mov "U_TEMP"(%5), %0 \n\t"
  1200. "mov "V_TEMP"(%5), %1 \n\t"
  1201. YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
  1202. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1203. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1204. "packuswb %%mm7, %%mm1 \n\t"
  1205. "pop %1 \n\t"
  1206. "pop %0 \n\t"
  1207. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1208. "pop %%"REG_BP" \n\t"
  1209. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1210. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1211. "a" (&c->redDither)
  1212. );
  1213. #endif
  1214. }else{
  1215. __asm__ volatile(
  1216. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1217. "mov %4, %%"REG_b" \n\t"
  1218. "push %%"REG_BP" \n\t"
  1219. YSCALEYUV2RGB(%%REGBP, %5)
  1220. "pcmpeqd %%mm7, %%mm7 \n\t"
  1221. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1222. "pop %%"REG_BP" \n\t"
  1223. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1224. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1225. "a" (&c->redDither)
  1226. );
  1227. }
  1228. return;
  1229. case PIX_FMT_BGR24:
  1230. __asm__ volatile(
  1231. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1232. "mov %4, %%"REG_b" \n\t"
  1233. "push %%"REG_BP" \n\t"
  1234. YSCALEYUV2RGB(%%REGBP, %5)
  1235. "pxor %%mm7, %%mm7 \n\t"
  1236. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1237. "pop %%"REG_BP" \n\t"
  1238. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1239. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1240. "a" (&c->redDither)
  1241. );
  1242. return;
  1243. case PIX_FMT_RGB555:
  1244. __asm__ volatile(
  1245. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1246. "mov %4, %%"REG_b" \n\t"
  1247. "push %%"REG_BP" \n\t"
  1248. YSCALEYUV2RGB(%%REGBP, %5)
  1249. "pxor %%mm7, %%mm7 \n\t"
  1250. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1251. #ifdef DITHER1XBPP
  1252. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1253. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1254. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1255. #endif
  1256. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1257. "pop %%"REG_BP" \n\t"
  1258. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1259. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1260. "a" (&c->redDither)
  1261. );
  1262. return;
  1263. case PIX_FMT_RGB565:
  1264. __asm__ volatile(
  1265. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1266. "mov %4, %%"REG_b" \n\t"
  1267. "push %%"REG_BP" \n\t"
  1268. YSCALEYUV2RGB(%%REGBP, %5)
  1269. "pxor %%mm7, %%mm7 \n\t"
  1270. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1271. #ifdef DITHER1XBPP
  1272. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1273. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1274. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1275. #endif
  1276. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1277. "pop %%"REG_BP" \n\t"
  1278. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1279. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1280. "a" (&c->redDither)
  1281. );
  1282. return;
  1283. case PIX_FMT_YUYV422:
  1284. __asm__ volatile(
  1285. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1286. "mov %4, %%"REG_b" \n\t"
  1287. "push %%"REG_BP" \n\t"
  1288. YSCALEYUV2PACKED(%%REGBP, %5)
  1289. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1290. "pop %%"REG_BP" \n\t"
  1291. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1292. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1293. "a" (&c->redDither)
  1294. );
  1295. return;
  1296. default: break;
  1297. }
  1298. }
  1299. #endif //HAVE_MMX
  1300. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
  1301. }
  1302. /**
  1303. * YV12 to RGB without scaling or interpolating
  1304. */
  1305. static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
  1306. uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
  1307. {
  1308. const int yalpha1=0;
  1309. int i;
  1310. uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1311. const int yalpha= 4096; //FIXME ...
  1312. if (flags&SWS_FULL_CHR_H_INT)
  1313. {
  1314. RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
  1315. return;
  1316. }
  1317. #if HAVE_MMX
  1318. if(!(flags & SWS_BITEXACT)){
  1319. if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1320. {
  1321. switch(dstFormat)
  1322. {
  1323. case PIX_FMT_RGB32:
  1324. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
  1325. __asm__ volatile(
  1326. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1327. "mov %4, %%"REG_b" \n\t"
  1328. "push %%"REG_BP" \n\t"
  1329. YSCALEYUV2RGB1(%%REGBP, %5)
  1330. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1331. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1332. "pop %%"REG_BP" \n\t"
  1333. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1334. :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1335. "a" (&c->redDither)
  1336. );
  1337. }else{
  1338. __asm__ volatile(
  1339. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1340. "mov %4, %%"REG_b" \n\t"
  1341. "push %%"REG_BP" \n\t"
  1342. YSCALEYUV2RGB1(%%REGBP, %5)
  1343. "pcmpeqd %%mm7, %%mm7 \n\t"
  1344. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1345. "pop %%"REG_BP" \n\t"
  1346. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1347. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1348. "a" (&c->redDither)
  1349. );
  1350. }
  1351. return;
  1352. case PIX_FMT_BGR24:
  1353. __asm__ volatile(
  1354. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1355. "mov %4, %%"REG_b" \n\t"
  1356. "push %%"REG_BP" \n\t"
  1357. YSCALEYUV2RGB1(%%REGBP, %5)
  1358. "pxor %%mm7, %%mm7 \n\t"
  1359. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1360. "pop %%"REG_BP" \n\t"
  1361. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1362. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1363. "a" (&c->redDither)
  1364. );
  1365. return;
  1366. case PIX_FMT_RGB555:
  1367. __asm__ volatile(
  1368. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1369. "mov %4, %%"REG_b" \n\t"
  1370. "push %%"REG_BP" \n\t"
  1371. YSCALEYUV2RGB1(%%REGBP, %5)
  1372. "pxor %%mm7, %%mm7 \n\t"
  1373. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1374. #ifdef DITHER1XBPP
  1375. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1376. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1377. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1378. #endif
  1379. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1380. "pop %%"REG_BP" \n\t"
  1381. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1382. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1383. "a" (&c->redDither)
  1384. );
  1385. return;
  1386. case PIX_FMT_RGB565:
  1387. __asm__ volatile(
  1388. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1389. "mov %4, %%"REG_b" \n\t"
  1390. "push %%"REG_BP" \n\t"
  1391. YSCALEYUV2RGB1(%%REGBP, %5)
  1392. "pxor %%mm7, %%mm7 \n\t"
  1393. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1394. #ifdef DITHER1XBPP
  1395. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1396. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1397. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1398. #endif
  1399. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1400. "pop %%"REG_BP" \n\t"
  1401. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1402. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1403. "a" (&c->redDither)
  1404. );
  1405. return;
  1406. case PIX_FMT_YUYV422:
  1407. __asm__ volatile(
  1408. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1409. "mov %4, %%"REG_b" \n\t"
  1410. "push %%"REG_BP" \n\t"
  1411. YSCALEYUV2PACKED1(%%REGBP, %5)
  1412. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1413. "pop %%"REG_BP" \n\t"
  1414. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1415. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1416. "a" (&c->redDither)
  1417. );
  1418. return;
  1419. }
  1420. }
  1421. else
  1422. {
  1423. switch(dstFormat)
  1424. {
  1425. case PIX_FMT_RGB32:
  1426. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
  1427. __asm__ volatile(
  1428. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1429. "mov %4, %%"REG_b" \n\t"
  1430. "push %%"REG_BP" \n\t"
  1431. YSCALEYUV2RGB1b(%%REGBP, %5)
  1432. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1433. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1434. "pop %%"REG_BP" \n\t"
  1435. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1436. :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1437. "a" (&c->redDither)
  1438. );
  1439. }else{
  1440. __asm__ volatile(
  1441. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1442. "mov %4, %%"REG_b" \n\t"
  1443. "push %%"REG_BP" \n\t"
  1444. YSCALEYUV2RGB1b(%%REGBP, %5)
  1445. "pcmpeqd %%mm7, %%mm7 \n\t"
  1446. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1447. "pop %%"REG_BP" \n\t"
  1448. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1449. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1450. "a" (&c->redDither)
  1451. );
  1452. }
  1453. return;
  1454. case PIX_FMT_BGR24:
  1455. __asm__ volatile(
  1456. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1457. "mov %4, %%"REG_b" \n\t"
  1458. "push %%"REG_BP" \n\t"
  1459. YSCALEYUV2RGB1b(%%REGBP, %5)
  1460. "pxor %%mm7, %%mm7 \n\t"
  1461. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1462. "pop %%"REG_BP" \n\t"
  1463. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1464. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1465. "a" (&c->redDither)
  1466. );
  1467. return;
  1468. case PIX_FMT_RGB555:
  1469. __asm__ volatile(
  1470. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1471. "mov %4, %%"REG_b" \n\t"
  1472. "push %%"REG_BP" \n\t"
  1473. YSCALEYUV2RGB1b(%%REGBP, %5)
  1474. "pxor %%mm7, %%mm7 \n\t"
  1475. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1476. #ifdef DITHER1XBPP
  1477. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1478. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1479. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1480. #endif
  1481. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1482. "pop %%"REG_BP" \n\t"
  1483. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1484. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1485. "a" (&c->redDither)
  1486. );
  1487. return;
  1488. case PIX_FMT_RGB565:
  1489. __asm__ volatile(
  1490. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1491. "mov %4, %%"REG_b" \n\t"
  1492. "push %%"REG_BP" \n\t"
  1493. YSCALEYUV2RGB1b(%%REGBP, %5)
  1494. "pxor %%mm7, %%mm7 \n\t"
  1495. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1496. #ifdef DITHER1XBPP
  1497. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1498. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1499. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1500. #endif
  1501. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1502. "pop %%"REG_BP" \n\t"
  1503. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1504. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1505. "a" (&c->redDither)
  1506. );
  1507. return;
  1508. case PIX_FMT_YUYV422:
  1509. __asm__ volatile(
  1510. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1511. "mov %4, %%"REG_b" \n\t"
  1512. "push %%"REG_BP" \n\t"
  1513. YSCALEYUV2PACKED1b(%%REGBP, %5)
  1514. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1515. "pop %%"REG_BP" \n\t"
  1516. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1517. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1518. "a" (&c->redDither)
  1519. );
  1520. return;
  1521. }
  1522. }
  1523. }
  1524. #endif /* HAVE_MMX */
  1525. if (uvalpha < 2048)
  1526. {
  1527. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
  1528. }else{
  1529. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
  1530. }
  1531. }
  1532. //FIXME yuy2* can read up to 7 samples too much
  1533. static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
  1534. {
  1535. #if HAVE_MMX
  1536. __asm__ volatile(
  1537. "movq "MANGLE(bm01010101)", %%mm2 \n\t"
  1538. "mov %0, %%"REG_a" \n\t"
  1539. "1: \n\t"
  1540. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1541. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1542. "pand %%mm2, %%mm0 \n\t"
  1543. "pand %%mm2, %%mm1 \n\t"
  1544. "packuswb %%mm1, %%mm0 \n\t"
  1545. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1546. "add $8, %%"REG_a" \n\t"
  1547. " js 1b \n\t"
  1548. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1549. : "%"REG_a
  1550. );
  1551. #else
  1552. int i;
  1553. for (i=0; i<width; i++)
  1554. dst[i]= src[2*i];
  1555. #endif
  1556. }
  1557. static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
  1558. {
  1559. #if HAVE_MMX
  1560. __asm__ volatile(
  1561. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1562. "mov %0, %%"REG_a" \n\t"
  1563. "1: \n\t"
  1564. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1565. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1566. "psrlw $8, %%mm0 \n\t"
  1567. "psrlw $8, %%mm1 \n\t"
  1568. "packuswb %%mm1, %%mm0 \n\t"
  1569. "movq %%mm0, %%mm1 \n\t"
  1570. "psrlw $8, %%mm0 \n\t"
  1571. "pand %%mm4, %%mm1 \n\t"
  1572. "packuswb %%mm0, %%mm0 \n\t"
  1573. "packuswb %%mm1, %%mm1 \n\t"
  1574. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1575. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1576. "add $4, %%"REG_a" \n\t"
  1577. " js 1b \n\t"
  1578. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1579. : "%"REG_a
  1580. );
  1581. #else
  1582. int i;
  1583. for (i=0; i<width; i++)
  1584. {
  1585. dstU[i]= src1[4*i + 1];
  1586. dstV[i]= src1[4*i + 3];
  1587. }
  1588. #endif
  1589. assert(src1 == src2);
  1590. }
1591. /* This is almost identical to the previous one, and exists only because
1592. * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
  1593. static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
  1594. {
  1595. #if HAVE_MMX
  1596. __asm__ volatile(
  1597. "mov %0, %%"REG_a" \n\t"
  1598. "1: \n\t"
  1599. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1600. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1601. "psrlw $8, %%mm0 \n\t"
  1602. "psrlw $8, %%mm1 \n\t"
  1603. "packuswb %%mm1, %%mm0 \n\t"
  1604. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1605. "add $8, %%"REG_a" \n\t"
  1606. " js 1b \n\t"
  1607. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1608. : "%"REG_a
  1609. );
  1610. #else
  1611. int i;
  1612. for (i=0; i<width; i++)
  1613. dst[i]= src[2*i+1];
  1614. #endif
  1615. }
  1616. static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
  1617. {
  1618. #if HAVE_MMX
  1619. __asm__ volatile(
  1620. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1621. "mov %0, %%"REG_a" \n\t"
  1622. "1: \n\t"
  1623. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1624. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1625. "pand %%mm4, %%mm0 \n\t"
  1626. "pand %%mm4, %%mm1 \n\t"
  1627. "packuswb %%mm1, %%mm0 \n\t"
  1628. "movq %%mm0, %%mm1 \n\t"
  1629. "psrlw $8, %%mm0 \n\t"
  1630. "pand %%mm4, %%mm1 \n\t"
  1631. "packuswb %%mm0, %%mm0 \n\t"
  1632. "packuswb %%mm1, %%mm1 \n\t"
  1633. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1634. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1635. "add $4, %%"REG_a" \n\t"
  1636. " js 1b \n\t"
  1637. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1638. : "%"REG_a
  1639. );
  1640. #else
  1641. int i;
  1642. for (i=0; i<width; i++)
  1643. {
  1644. dstU[i]= src1[4*i + 0];
  1645. dstV[i]= src1[4*i + 2];
  1646. }
  1647. #endif
  1648. assert(src1 == src2);
  1649. }
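/* Byte layout assumed by the four routines above (matching their C
 * fallbacks): YUYV422 stores Y0 U0 Y1 V0, so luma sits at even offsets and
 * U/V at 4*i+1 / 4*i+3; UYVY422 stores U0 Y0 V0 Y1, so luma sits at odd
 * offsets and U/V at 4*i+0 / 4*i+2. The MMX paths select the same bytes
 * with pand/psrlw (bm01010101 keeps the low byte of each 16-bit word)
 * before packing. */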
  1650. #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
  1651. static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
  1652. {\
  1653. int i;\
  1654. for (i=0; i<width; i++)\
  1655. {\
  1656. int b= (((type*)src)[i]>>shb)&maskb;\
  1657. int g= (((type*)src)[i]>>shg)&maskg;\
  1658. int r= (((type*)src)[i]>>shr)&maskr;\
  1659. \
  1660. dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
  1661. }\
  1662. }
  1663. BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
  1664. BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
  1665. BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
  1666. BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
  1667. BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
  1668. BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
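/* Reader's note on the coefficient shifting above (an interpretation, not
 * from the original author): for the 15/16-bit formats the masked field is
 * left at its native bit position and the coefficient is pre-shifted
 * instead, e.g. rgb16ToY pairs BY<<11 with the 5-bit blue field in bits
 * 0..4 and plain RY with the red field already sitting in bits 11..15,
 * while S is raised by 7 or 8 to compensate. The rounding constant
 * 33<<((S)-1) equals 16.5*2^S, i.e. it folds the +16 luma offset and a
 * rounding half into one term, so effectively
 *
 *     Y = ((RY*r + GY*g + BY*b) >> S) + 16
 */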
  1669. static inline void RENAME(abgrToA)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused){
  1670. int i;
  1671. for (i=0; i<width; i++){
  1672. dst[i]= src[4*i];
  1673. }
  1674. }
  1675. #define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
  1676. static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
  1677. {\
  1678. int i;\
  1679. for (i=0; i<width; i++)\
  1680. {\
  1681. int b= (((type*)src)[i]&maskb)>>shb;\
  1682. int g= (((type*)src)[i]&maskg)>>shg;\
  1683. int r= (((type*)src)[i]&maskr)>>shr;\
  1684. \
  1685. dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
  1686. dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
  1687. }\
  1688. }\
  1689. static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
  1690. {\
  1691. int i;\
  1692. for (i=0; i<width; i++)\
  1693. {\
  1694. int pix0= ((type*)src)[2*i+0];\
  1695. int pix1= ((type*)src)[2*i+1];\
  1696. int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
  1697. int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
  1698. int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
  1699. g&= maskg|(2*maskg);\
  1700. \
  1701. g>>=shg;\
  1702. \
  1703. dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
  1704. dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
  1705. }\
  1706. }
  1707. BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
  1708. BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
  1709. BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
  1710. BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
  1711. BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
  1712. BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
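/* The *_half variants above average two horizontally adjacent pixels
 * without unpacking them first; a sketch of the idea (reader's
 * interpretation), with RB = maskr|maskb:
 *
 *     g  = (pix0 & ~RB) + (pix1 & ~RB);   // sum of the non-R/B bits
 *     rb = pix0 + pix1 - g;               // == (pix0 & RB) + (pix1 & RB)
 *
 * Each per-field sum needs one extra bit, hence the widened masks such as
 * maskb|(2*maskb), and the final shift is (S)+1 rather than (S) so the
 * result corresponds to the average of the two pixels, not their sum. */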
  1713. #if HAVE_MMX
  1714. static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
  1715. {
  1716. if(srcFormat == PIX_FMT_BGR24){
  1717. __asm__ volatile(
  1718. "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
  1719. "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
  1720. :
  1721. );
  1722. }else{
  1723. __asm__ volatile(
  1724. "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
  1725. "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
  1726. :
  1727. );
  1728. }
  1729. __asm__ volatile(
  1730. "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
  1731. "mov %2, %%"REG_a" \n\t"
  1732. "pxor %%mm7, %%mm7 \n\t"
  1733. "1: \n\t"
  1734. PREFETCH" 64(%0) \n\t"
  1735. "movd (%0), %%mm0 \n\t"
  1736. "movd 2(%0), %%mm1 \n\t"
  1737. "movd 6(%0), %%mm2 \n\t"
  1738. "movd 8(%0), %%mm3 \n\t"
  1739. "add $12, %0 \n\t"
  1740. "punpcklbw %%mm7, %%mm0 \n\t"
  1741. "punpcklbw %%mm7, %%mm1 \n\t"
  1742. "punpcklbw %%mm7, %%mm2 \n\t"
  1743. "punpcklbw %%mm7, %%mm3 \n\t"
  1744. "pmaddwd %%mm5, %%mm0 \n\t"
  1745. "pmaddwd %%mm6, %%mm1 \n\t"
  1746. "pmaddwd %%mm5, %%mm2 \n\t"
  1747. "pmaddwd %%mm6, %%mm3 \n\t"
  1748. "paddd %%mm1, %%mm0 \n\t"
  1749. "paddd %%mm3, %%mm2 \n\t"
  1750. "paddd %%mm4, %%mm0 \n\t"
  1751. "paddd %%mm4, %%mm2 \n\t"
  1752. "psrad $15, %%mm0 \n\t"
  1753. "psrad $15, %%mm2 \n\t"
  1754. "packssdw %%mm2, %%mm0 \n\t"
  1755. "packuswb %%mm0, %%mm0 \n\t"
  1756. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1757. "add $4, %%"REG_a" \n\t"
  1758. " js 1b \n\t"
  1759. : "+r" (src)
  1760. : "r" (dst+width), "g" ((x86_reg)-width)
  1761. : "%"REG_a
  1762. );
  1763. }
  1764. static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
  1765. {
  1766. __asm__ volatile(
  1767. "movq 24+%4, %%mm6 \n\t"
  1768. "mov %3, %%"REG_a" \n\t"
  1769. "pxor %%mm7, %%mm7 \n\t"
  1770. "1: \n\t"
  1771. PREFETCH" 64(%0) \n\t"
  1772. "movd (%0), %%mm0 \n\t"
  1773. "movd 2(%0), %%mm1 \n\t"
  1774. "punpcklbw %%mm7, %%mm0 \n\t"
  1775. "punpcklbw %%mm7, %%mm1 \n\t"
  1776. "movq %%mm0, %%mm2 \n\t"
  1777. "movq %%mm1, %%mm3 \n\t"
  1778. "pmaddwd %4, %%mm0 \n\t"
  1779. "pmaddwd 8+%4, %%mm1 \n\t"
  1780. "pmaddwd 16+%4, %%mm2 \n\t"
  1781. "pmaddwd %%mm6, %%mm3 \n\t"
  1782. "paddd %%mm1, %%mm0 \n\t"
  1783. "paddd %%mm3, %%mm2 \n\t"
  1784. "movd 6(%0), %%mm1 \n\t"
  1785. "movd 8(%0), %%mm3 \n\t"
  1786. "add $12, %0 \n\t"
  1787. "punpcklbw %%mm7, %%mm1 \n\t"
  1788. "punpcklbw %%mm7, %%mm3 \n\t"
  1789. "movq %%mm1, %%mm4 \n\t"
  1790. "movq %%mm3, %%mm5 \n\t"
  1791. "pmaddwd %4, %%mm1 \n\t"
  1792. "pmaddwd 8+%4, %%mm3 \n\t"
  1793. "pmaddwd 16+%4, %%mm4 \n\t"
  1794. "pmaddwd %%mm6, %%mm5 \n\t"
  1795. "paddd %%mm3, %%mm1 \n\t"
  1796. "paddd %%mm5, %%mm4 \n\t"
  1797. "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
  1798. "paddd %%mm3, %%mm0 \n\t"
  1799. "paddd %%mm3, %%mm2 \n\t"
  1800. "paddd %%mm3, %%mm1 \n\t"
  1801. "paddd %%mm3, %%mm4 \n\t"
  1802. "psrad $15, %%mm0 \n\t"
  1803. "psrad $15, %%mm2 \n\t"
  1804. "psrad $15, %%mm1 \n\t"
  1805. "psrad $15, %%mm4 \n\t"
  1806. "packssdw %%mm1, %%mm0 \n\t"
  1807. "packssdw %%mm4, %%mm2 \n\t"
  1808. "packuswb %%mm0, %%mm0 \n\t"
  1809. "packuswb %%mm2, %%mm2 \n\t"
  1810. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1811. "movd %%mm2, (%2, %%"REG_a") \n\t"
  1812. "add $4, %%"REG_a" \n\t"
  1813. " js 1b \n\t"
  1814. : "+r" (src)
  1815. : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
  1816. : "%"REG_a
  1817. );
  1818. }
  1819. #endif
  1820. static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
  1821. {
  1822. #if HAVE_MMX
  1823. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
  1824. #else
  1825. int i;
  1826. for (i=0; i<width; i++)
  1827. {
  1828. int b= src[i*3+0];
  1829. int g= src[i*3+1];
  1830. int r= src[i*3+2];
  1831. dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
  1832. }
  1833. #endif /* HAVE_MMX */
  1834. }
  1835. static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
  1836. {
  1837. #if HAVE_MMX
  1838. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
  1839. #else
  1840. int i;
  1841. for (i=0; i<width; i++)
  1842. {
  1843. int b= src1[3*i + 0];
  1844. int g= src1[3*i + 1];
  1845. int r= src1[3*i + 2];
  1846. dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1847. dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1848. }
  1849. #endif /* HAVE_MMX */
  1850. assert(src1 == src2);
  1851. }
  1852. static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
  1853. {
  1854. int i;
  1855. for (i=0; i<width; i++)
  1856. {
  1857. int b= src1[6*i + 0] + src1[6*i + 3];
  1858. int g= src1[6*i + 1] + src1[6*i + 4];
  1859. int r= src1[6*i + 2] + src1[6*i + 5];
  1860. dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1861. dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1862. }
  1863. assert(src1 == src2);
  1864. }
  1865. static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
  1866. {
  1867. #if HAVE_MMX
  1868. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
  1869. #else
  1870. int i;
  1871. for (i=0; i<width; i++)
  1872. {
  1873. int r= src[i*3+0];
  1874. int g= src[i*3+1];
  1875. int b= src[i*3+2];
  1876. dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
  1877. }
  1878. #endif
  1879. }
  1880. static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
  1881. {
  1882. #if HAVE_MMX
  1883. assert(src1==src2);
  1884. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
  1885. #else
  1886. int i;
  1887. assert(src1==src2);
  1888. for (i=0; i<width; i++)
  1889. {
  1890. int r= src1[3*i + 0];
  1891. int g= src1[3*i + 1];
  1892. int b= src1[3*i + 2];
  1893. dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1894. dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1895. }
  1896. #endif
  1897. }
  1898. static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
  1899. {
  1900. int i;
  1901. assert(src1==src2);
  1902. for (i=0; i<width; i++)
  1903. {
  1904. int r= src1[6*i + 0] + src1[6*i + 3];
  1905. int g= src1[6*i + 1] + src1[6*i + 4];
  1906. int b= src1[6*i + 2] + src1[6*i + 5];
  1907. dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1908. dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1909. }
  1910. }
  1911. static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
  1912. {
  1913. int i;
  1914. for (i=0; i<width; i++)
  1915. {
  1916. int d= src[i];
  1917. dst[i]= pal[d] & 0xFF;
  1918. }
  1919. }
  1920. static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
  1921. {
  1922. int i;
  1923. assert(src1 == src2);
  1924. for (i=0; i<width; i++)
  1925. {
  1926. int p= pal[src1[i]];
  1927. dstU[i]= p>>8;
  1928. dstV[i]= p>>16;
  1929. }
  1930. }
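/* As the two palette helpers above imply, the pal[] entries (c->pal_yuv)
 * for PAL8/RGB8/BGR8/RGB4_BYTE/BGR4_BYTE input are already converted to
 * packed YUV, with Y in bits 0..7, U in bits 8..15 and V in bits 16..23 of
 * each 32-bit entry. */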
  1931. static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
  1932. {
  1933. int i, j;
  1934. for (i=0; i<width/8; i++){
  1935. int d= ~src[i];
  1936. for(j=0; j<8; j++)
  1937. dst[8*i+j]= ((d>>(7-j))&1)*255;
  1938. }
  1939. }
  1940. static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
  1941. {
  1942. int i, j;
  1943. for (i=0; i<width/8; i++){
  1944. int d= src[i];
  1945. for(j=0; j<8; j++)
  1946. dst[8*i+j]= ((d>>(7-j))&1)*255;
  1947. }
  1948. }
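/* Worked example for the two 1 bpp expanders above: a source byte of 0xB0
 * (binary 10110000) becomes dst[0..7] = 255,0,255,255,0,0,0,0 with
 * monoblack2Y (set bits are white), while monowhite2Y inverts the byte
 * first and so yields 0,255,0,0,255,255,255,255 for the same input. */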
  1949. // bilinear / bicubic scaling
  1950. static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
  1951. int16_t *filter, int16_t *filterPos, long filterSize)
  1952. {
  1953. #if HAVE_MMX
  1954. assert(filterSize % 4 == 0 && filterSize>0);
  1955. if (filterSize==4) // Always true for upscaling, sometimes for down, too.
  1956. {
  1957. x86_reg counter= -2*dstW;
  1958. filter-= counter*2;
  1959. filterPos-= counter/2;
  1960. dst-= counter/2;
  1961. __asm__ volatile(
  1962. #if defined(PIC)
  1963. "push %%"REG_b" \n\t"
  1964. #endif
  1965. "pxor %%mm7, %%mm7 \n\t"
  1966. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  1967. "mov %%"REG_a", %%"REG_BP" \n\t"
  1968. ASMALIGN(4)
  1969. "1: \n\t"
  1970. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  1971. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  1972. "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
  1973. "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
  1974. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  1975. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  1976. "punpcklbw %%mm7, %%mm0 \n\t"
  1977. "punpcklbw %%mm7, %%mm2 \n\t"
  1978. "pmaddwd %%mm1, %%mm0 \n\t"
  1979. "pmaddwd %%mm2, %%mm3 \n\t"
  1980. "movq %%mm0, %%mm4 \n\t"
  1981. "punpckldq %%mm3, %%mm0 \n\t"
  1982. "punpckhdq %%mm3, %%mm4 \n\t"
  1983. "paddd %%mm4, %%mm0 \n\t"
  1984. "psrad $7, %%mm0 \n\t"
  1985. "packssdw %%mm0, %%mm0 \n\t"
  1986. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  1987. "add $4, %%"REG_BP" \n\t"
  1988. " jnc 1b \n\t"
  1989. "pop %%"REG_BP" \n\t"
  1990. #if defined(PIC)
  1991. "pop %%"REG_b" \n\t"
  1992. #endif
  1993. : "+a" (counter)
  1994. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  1995. #if !defined(PIC)
  1996. : "%"REG_b
  1997. #endif
  1998. );
  1999. }
  2000. else if (filterSize==8)
  2001. {
  2002. x86_reg counter= -2*dstW;
  2003. filter-= counter*4;
  2004. filterPos-= counter/2;
  2005. dst-= counter/2;
  2006. __asm__ volatile(
  2007. #if defined(PIC)
  2008. "push %%"REG_b" \n\t"
  2009. #endif
  2010. "pxor %%mm7, %%mm7 \n\t"
  2011. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  2012. "mov %%"REG_a", %%"REG_BP" \n\t"
  2013. ASMALIGN(4)
  2014. "1: \n\t"
  2015. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  2016. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  2017. "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
  2018. "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
  2019. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  2020. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  2021. "punpcklbw %%mm7, %%mm0 \n\t"
  2022. "punpcklbw %%mm7, %%mm2 \n\t"
  2023. "pmaddwd %%mm1, %%mm0 \n\t"
  2024. "pmaddwd %%mm2, %%mm3 \n\t"
  2025. "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
  2026. "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
  2027. "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
  2028. "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
  2029. "punpcklbw %%mm7, %%mm4 \n\t"
  2030. "punpcklbw %%mm7, %%mm2 \n\t"
  2031. "pmaddwd %%mm1, %%mm4 \n\t"
  2032. "pmaddwd %%mm2, %%mm5 \n\t"
  2033. "paddd %%mm4, %%mm0 \n\t"
  2034. "paddd %%mm5, %%mm3 \n\t"
  2035. "movq %%mm0, %%mm4 \n\t"
  2036. "punpckldq %%mm3, %%mm0 \n\t"
  2037. "punpckhdq %%mm3, %%mm4 \n\t"
  2038. "paddd %%mm4, %%mm0 \n\t"
  2039. "psrad $7, %%mm0 \n\t"
  2040. "packssdw %%mm0, %%mm0 \n\t"
  2041. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  2042. "add $4, %%"REG_BP" \n\t"
  2043. " jnc 1b \n\t"
  2044. "pop %%"REG_BP" \n\t"
  2045. #if defined(PIC)
  2046. "pop %%"REG_b" \n\t"
  2047. #endif
  2048. : "+a" (counter)
  2049. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  2050. #if !defined(PIC)
  2051. : "%"REG_b
  2052. #endif
  2053. );
  2054. }
  2055. else
  2056. {
  2057. uint8_t *offset = src+filterSize;
  2058. x86_reg counter= -2*dstW;
  2059. //filter-= counter*filterSize/2;
  2060. filterPos-= counter/2;
  2061. dst-= counter/2;
  2062. __asm__ volatile(
  2063. "pxor %%mm7, %%mm7 \n\t"
  2064. ASMALIGN(4)
  2065. "1: \n\t"
  2066. "mov %2, %%"REG_c" \n\t"
  2067. "movzwl (%%"REG_c", %0), %%eax \n\t"
  2068. "movzwl 2(%%"REG_c", %0), %%edx \n\t"
  2069. "mov %5, %%"REG_c" \n\t"
  2070. "pxor %%mm4, %%mm4 \n\t"
  2071. "pxor %%mm5, %%mm5 \n\t"
  2072. "2: \n\t"
  2073. "movq (%1), %%mm1 \n\t"
  2074. "movq (%1, %6), %%mm3 \n\t"
  2075. "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
  2076. "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
  2077. "punpcklbw %%mm7, %%mm0 \n\t"
  2078. "punpcklbw %%mm7, %%mm2 \n\t"
  2079. "pmaddwd %%mm1, %%mm0 \n\t"
  2080. "pmaddwd %%mm2, %%mm3 \n\t"
  2081. "paddd %%mm3, %%mm5 \n\t"
  2082. "paddd %%mm0, %%mm4 \n\t"
  2083. "add $8, %1 \n\t"
  2084. "add $4, %%"REG_c" \n\t"
  2085. "cmp %4, %%"REG_c" \n\t"
  2086. " jb 2b \n\t"
  2087. "add %6, %1 \n\t"
  2088. "movq %%mm4, %%mm0 \n\t"
  2089. "punpckldq %%mm5, %%mm4 \n\t"
  2090. "punpckhdq %%mm5, %%mm0 \n\t"
  2091. "paddd %%mm0, %%mm4 \n\t"
  2092. "psrad $7, %%mm4 \n\t"
  2093. "packssdw %%mm4, %%mm4 \n\t"
  2094. "mov %3, %%"REG_a" \n\t"
  2095. "movd %%mm4, (%%"REG_a", %0) \n\t"
  2096. "add $4, %0 \n\t"
  2097. " jnc 1b \n\t"
  2098. : "+r" (counter), "+r" (filter)
  2099. : "m" (filterPos), "m" (dst), "m"(offset),
  2100. "m" (src), "r" ((x86_reg)filterSize*2)
  2101. : "%"REG_a, "%"REG_c, "%"REG_d
  2102. );
  2103. }
  2104. #else
  2105. #if HAVE_ALTIVEC
  2106. hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
  2107. #else
  2108. int i;
  2109. for (i=0; i<dstW; i++)
  2110. {
  2111. int j;
  2112. int srcPos= filterPos[i];
  2113. int val=0;
  2114. //printf("filterPos: %d\n", filterPos[i]);
  2115. for (j=0; j<filterSize; j++)
  2116. {
  2117. //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
  2118. val += ((int)src[srcPos + j])*filter[filterSize*i + j];
  2119. }
  2120. //filter += hFilterSize;
  2121. dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
  2122. //dst[i] = val>>7;
  2123. }
  2124. #endif /* HAVE_ALTIVEC */
  2125. #endif /* HAVE_MMX */
  2126. }
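/* A scalar sketch of what hScale() computes for one output sample in the
 * filterSize==4 case, restating the C fallback above (pos is just an
 * illustrative name):
 *
 *     pos = filterPos[i];
 *     dst[i] = FFMIN((src[pos+0]*filter[4*i+0] + src[pos+1]*filter[4*i+1] +
 *                     src[pos+2]*filter[4*i+2] + src[pos+3]*filter[4*i+3]) >> 7,
 *                    (1<<15)-1);
 *
 * i.e. one small FIR per output pixel; the coefficients are pre-scaled so
 * the result lands in the same ~15-bit range (src scaled by 128) that the
 * fast-bilinear path produces and the vertical scalers expect. */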
  2127. // *** horizontal scale Y line to temp buffer
  2128. static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
  2129. int flags, int canMMX2BeUsed, int16_t *hLumFilter,
  2130. int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
  2131. int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
  2132. int32_t *mmx2FilterPos, uint32_t *pal, int isAlpha)
  2133. {
  2134. if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
  2135. {
  2136. RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
  2137. src= formatConvBuffer;
  2138. }
  2139. else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
  2140. {
  2141. RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
  2142. src= formatConvBuffer;
  2143. }
  2144. else if (srcFormat==PIX_FMT_RGB32)
  2145. {
  2146. if (isAlpha)
  2147. RENAME(abgrToA)(formatConvBuffer, src+3, srcW, pal);
  2148. else
  2149. RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
  2150. src= formatConvBuffer;
  2151. }
  2152. else if (srcFormat==PIX_FMT_RGB32_1)
  2153. {
  2154. if (isAlpha)
  2155. RENAME(abgrToA)(formatConvBuffer, src, srcW, pal);
  2156. else
  2157. RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
  2158. src= formatConvBuffer;
  2159. }
  2160. else if (srcFormat==PIX_FMT_BGR24)
  2161. {
  2162. RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
  2163. src= formatConvBuffer;
  2164. }
  2165. else if (srcFormat==PIX_FMT_BGR565)
  2166. {
  2167. RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
  2168. src= formatConvBuffer;
  2169. }
  2170. else if (srcFormat==PIX_FMT_BGR555)
  2171. {
  2172. RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
  2173. src= formatConvBuffer;
  2174. }
  2175. else if (srcFormat==PIX_FMT_BGR32)
  2176. {
  2177. if (isAlpha)
  2178. RENAME(abgrToA)(formatConvBuffer, src+3, srcW, pal);
  2179. else
  2180. RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
  2181. src= formatConvBuffer;
  2182. }
  2183. else if (srcFormat==PIX_FMT_BGR32_1)
  2184. {
  2185. if (isAlpha)
  2186. RENAME(abgrToA)(formatConvBuffer, src, srcW, pal);
  2187. else
  2188. RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
  2189. src= formatConvBuffer;
  2190. }
  2191. else if (srcFormat==PIX_FMT_RGB24)
  2192. {
  2193. RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
  2194. src= formatConvBuffer;
  2195. }
  2196. else if (srcFormat==PIX_FMT_RGB565)
  2197. {
  2198. RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
  2199. src= formatConvBuffer;
  2200. }
  2201. else if (srcFormat==PIX_FMT_RGB555)
  2202. {
  2203. RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
  2204. src= formatConvBuffer;
  2205. }
  2206. else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
  2207. {
  2208. RENAME(palToY)(formatConvBuffer, src, srcW, pal);
  2209. src= formatConvBuffer;
  2210. }
  2211. else if (srcFormat==PIX_FMT_MONOBLACK)
  2212. {
  2213. RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
  2214. src= formatConvBuffer;
  2215. }
  2216. else if (srcFormat==PIX_FMT_MONOWHITE)
  2217. {
  2218. RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
  2219. src= formatConvBuffer;
  2220. }
  2221. #if HAVE_MMX
  2222. // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
  2223. if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
  2224. #else
  2225. if (!(flags&SWS_FAST_BILINEAR))
  2226. #endif
  2227. {
  2228. RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
  2229. }
  2230. else // fast bilinear upscale / crap downscale
  2231. {
  2232. #if ARCH_X86 && CONFIG_GPL
  2233. #if HAVE_MMX2
  2234. int i;
  2235. #if defined(PIC)
  2236. uint64_t ebxsave __attribute__((aligned(8)));
  2237. #endif
  2238. if (canMMX2BeUsed)
  2239. {
  2240. __asm__ volatile(
  2241. #if defined(PIC)
  2242. "mov %%"REG_b", %5 \n\t"
  2243. #endif
  2244. "pxor %%mm7, %%mm7 \n\t"
  2245. "mov %0, %%"REG_c" \n\t"
  2246. "mov %1, %%"REG_D" \n\t"
  2247. "mov %2, %%"REG_d" \n\t"
  2248. "mov %3, %%"REG_b" \n\t"
  2249. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2250. PREFETCH" (%%"REG_c") \n\t"
  2251. PREFETCH" 32(%%"REG_c") \n\t"
  2252. PREFETCH" 64(%%"REG_c") \n\t"
  2253. #if ARCH_X86_64
  2254. #define FUNNY_Y_CODE \
  2255. "movl (%%"REG_b"), %%esi \n\t"\
  2256. "call *%4 \n\t"\
  2257. "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
  2258. "add %%"REG_S", %%"REG_c" \n\t"\
  2259. "add %%"REG_a", %%"REG_D" \n\t"\
  2260. "xor %%"REG_a", %%"REG_a" \n\t"\
  2261. #else
  2262. #define FUNNY_Y_CODE \
  2263. "movl (%%"REG_b"), %%esi \n\t"\
  2264. "call *%4 \n\t"\
  2265. "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
  2266. "add %%"REG_a", %%"REG_D" \n\t"\
  2267. "xor %%"REG_a", %%"REG_a" \n\t"\
  2268. #endif /* ARCH_X86_64 */
  2269. FUNNY_Y_CODE
  2270. FUNNY_Y_CODE
  2271. FUNNY_Y_CODE
  2272. FUNNY_Y_CODE
  2273. FUNNY_Y_CODE
  2274. FUNNY_Y_CODE
  2275. FUNNY_Y_CODE
  2276. FUNNY_Y_CODE
  2277. #if defined(PIC)
  2278. "mov %5, %%"REG_b" \n\t"
  2279. #endif
  2280. :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
  2281. "m" (funnyYCode)
  2282. #if defined(PIC)
  2283. ,"m" (ebxsave)
  2284. #endif
  2285. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2286. #if !defined(PIC)
  2287. ,"%"REG_b
  2288. #endif
  2289. );
  2290. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
  2291. }
  2292. else
  2293. {
  2294. #endif /* HAVE_MMX2 */
  2295. x86_reg xInc_shr16 = xInc >> 16;
  2296. uint16_t xInc_mask = xInc & 0xffff;
  2297. //NO MMX just normal asm ...
  2298. __asm__ volatile(
  2299. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2300. "xor %%"REG_d", %%"REG_d" \n\t" // xx
  2301. "xorl %%ecx, %%ecx \n\t" // 2*xalpha
  2302. ASMALIGN(4)
  2303. "1: \n\t"
  2304. "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
  2305. "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2306. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  2307. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  2308. "shll $16, %%edi \n\t"
  2309. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  2310. "mov %1, %%"REG_D" \n\t"
  2311. "shrl $9, %%esi \n\t"
  2312. "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
  2313. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  2314. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
  2315. "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
  2316. "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2317. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  2318. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  2319. "shll $16, %%edi \n\t"
  2320. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  2321. "mov %1, %%"REG_D" \n\t"
  2322. "shrl $9, %%esi \n\t"
  2323. "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
  2324. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  2325. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
  2326. "add $2, %%"REG_a" \n\t"
  2327. "cmp %2, %%"REG_a" \n\t"
  2328. " jb 1b \n\t"
  2329. :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
  2330. : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
  2331. );
  2332. #if HAVE_MMX2
  2333. } //if MMX2 can't be used
  2334. #endif
  2335. #else
  2336. int i;
  2337. unsigned int xpos=0;
  2338. for (i=0;i<dstWidth;i++)
  2339. {
  2340. register unsigned int xx=xpos>>16;
  2341. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  2342. dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
  2343. xpos+=xInc;
  2344. }
  2345. #endif /* ARCH_X86 */
  2346. }
  2347. if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
  2348. int i;
2349. //FIXME all pal and rgb srcFormats could do this conversion as well
  2350. //FIXME all scalers more complex than bilinear could do half of this transform
  2351. if(c->srcRange){
  2352. for (i=0; i<dstWidth; i++)
  2353. dst[i]= (dst[i]*14071 + 33561947)>>14;
  2354. }else{
  2355. for (i=0; i<dstWidth; i++)
  2356. dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
  2357. }
  2358. }
  2359. }
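/* Reader's note on the magic constants in the luma range conversion above
 * (an interpretation, not from the original comments): the intermediate
 * luma is the 8-bit value scaled by 128, so
 *     (dst*14071 + 33561947) >> 14             ~ dst*219/255 + 16*128
 *     (FFMIN(dst,30189)*19077 - 39057361) >> 14 ~ (dst - 16*128)*255/219
 * i.e. JPEG (full) range to MPEG (limited) range and back; the FFMIN caps
 * the result at 0x7FFF so it still fits the int16_t buffer. */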
  2360. inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
  2361. int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
  2362. int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
  2363. int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
  2364. int32_t *mmx2FilterPos, uint32_t *pal)
  2365. {
  2366. if (srcFormat==PIX_FMT_YUYV422)
  2367. {
  2368. RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2369. src1= formatConvBuffer;
  2370. src2= formatConvBuffer+VOFW;
  2371. }
  2372. else if (srcFormat==PIX_FMT_UYVY422)
  2373. {
  2374. RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2375. src1= formatConvBuffer;
  2376. src2= formatConvBuffer+VOFW;
  2377. }
  2378. else if (srcFormat==PIX_FMT_RGB32)
  2379. {
  2380. if(c->chrSrcHSubSample)
  2381. RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2382. else
  2383. RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2384. src1= formatConvBuffer;
  2385. src2= formatConvBuffer+VOFW;
  2386. }
  2387. else if (srcFormat==PIX_FMT_RGB32_1)
  2388. {
  2389. if(c->chrSrcHSubSample)
  2390. RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
  2391. else
  2392. RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
  2393. src1= formatConvBuffer;
  2394. src2= formatConvBuffer+VOFW;
  2395. }
  2396. else if (srcFormat==PIX_FMT_BGR24)
  2397. {
  2398. if(c->chrSrcHSubSample)
  2399. RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2400. else
  2401. RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2402. src1= formatConvBuffer;
  2403. src2= formatConvBuffer+VOFW;
  2404. }
  2405. else if (srcFormat==PIX_FMT_BGR565)
  2406. {
  2407. if(c->chrSrcHSubSample)
  2408. RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2409. else
  2410. RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2411. src1= formatConvBuffer;
  2412. src2= formatConvBuffer+VOFW;
  2413. }
  2414. else if (srcFormat==PIX_FMT_BGR555)
  2415. {
  2416. if(c->chrSrcHSubSample)
  2417. RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2418. else
  2419. RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2420. src1= formatConvBuffer;
  2421. src2= formatConvBuffer+VOFW;
  2422. }
  2423. else if (srcFormat==PIX_FMT_BGR32)
  2424. {
  2425. if(c->chrSrcHSubSample)
  2426. RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2427. else
  2428. RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2429. src1= formatConvBuffer;
  2430. src2= formatConvBuffer+VOFW;
  2431. }
  2432. else if (srcFormat==PIX_FMT_BGR32_1)
  2433. {
  2434. if(c->chrSrcHSubSample)
  2435. RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
  2436. else
  2437. RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
  2438. src1= formatConvBuffer;
  2439. src2= formatConvBuffer+VOFW;
  2440. }
  2441. else if (srcFormat==PIX_FMT_RGB24)
  2442. {
  2443. if(c->chrSrcHSubSample)
  2444. RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2445. else
  2446. RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2447. src1= formatConvBuffer;
  2448. src2= formatConvBuffer+VOFW;
  2449. }
  2450. else if (srcFormat==PIX_FMT_RGB565)
  2451. {
  2452. if(c->chrSrcHSubSample)
  2453. RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2454. else
  2455. RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2456. src1= formatConvBuffer;
  2457. src2= formatConvBuffer+VOFW;
  2458. }
  2459. else if (srcFormat==PIX_FMT_RGB555)
  2460. {
  2461. if(c->chrSrcHSubSample)
  2462. RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2463. else
  2464. RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2465. src1= formatConvBuffer;
  2466. src2= formatConvBuffer+VOFW;
  2467. }
  2468. else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
  2469. {
  2470. return;
  2471. }
  2472. else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
  2473. {
  2474. RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2475. src1= formatConvBuffer;
  2476. src2= formatConvBuffer+VOFW;
  2477. }
  2478. #if HAVE_MMX
  2479. // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
  2480. if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
  2481. #else
  2482. if (!(flags&SWS_FAST_BILINEAR))
  2483. #endif
  2484. {
  2485. RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  2486. RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  2487. }
  2488. else // fast bilinear upscale / crap downscale
  2489. {
  2490. #if ARCH_X86 && CONFIG_GPL
  2491. #if HAVE_MMX2
  2492. int i;
  2493. #if defined(PIC)
  2494. uint64_t ebxsave __attribute__((aligned(8)));
  2495. #endif
  2496. if (canMMX2BeUsed)
  2497. {
  2498. __asm__ volatile(
  2499. #if defined(PIC)
  2500. "mov %%"REG_b", %6 \n\t"
  2501. #endif
  2502. "pxor %%mm7, %%mm7 \n\t"
  2503. "mov %0, %%"REG_c" \n\t"
  2504. "mov %1, %%"REG_D" \n\t"
  2505. "mov %2, %%"REG_d" \n\t"
  2506. "mov %3, %%"REG_b" \n\t"
  2507. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2508. PREFETCH" (%%"REG_c") \n\t"
  2509. PREFETCH" 32(%%"REG_c") \n\t"
  2510. PREFETCH" 64(%%"REG_c") \n\t"
  2511. #if ARCH_X86_64
  2512. #define FUNNY_UV_CODE \
  2513. "movl (%%"REG_b"), %%esi \n\t"\
  2514. "call *%4 \n\t"\
  2515. "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
  2516. "add %%"REG_S", %%"REG_c" \n\t"\
  2517. "add %%"REG_a", %%"REG_D" \n\t"\
  2518. "xor %%"REG_a", %%"REG_a" \n\t"\
  2519. #else
  2520. #define FUNNY_UV_CODE \
  2521. "movl (%%"REG_b"), %%esi \n\t"\
  2522. "call *%4 \n\t"\
  2523. "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
  2524. "add %%"REG_a", %%"REG_D" \n\t"\
  2525. "xor %%"REG_a", %%"REG_a" \n\t"\
  2526. #endif /* ARCH_X86_64 */
  2527. FUNNY_UV_CODE
  2528. FUNNY_UV_CODE
  2529. FUNNY_UV_CODE
  2530. FUNNY_UV_CODE
  2531. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2532. "mov %5, %%"REG_c" \n\t" // src
  2533. "mov %1, %%"REG_D" \n\t" // buf1
  2534. "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
  2535. PREFETCH" (%%"REG_c") \n\t"
  2536. PREFETCH" 32(%%"REG_c") \n\t"
  2537. PREFETCH" 64(%%"REG_c") \n\t"
  2538. FUNNY_UV_CODE
  2539. FUNNY_UV_CODE
  2540. FUNNY_UV_CODE
  2541. FUNNY_UV_CODE
  2542. #if defined(PIC)
  2543. "mov %6, %%"REG_b" \n\t"
  2544. #endif
  2545. :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
  2546. "m" (funnyUVCode), "m" (src2)
  2547. #if defined(PIC)
  2548. ,"m" (ebxsave)
  2549. #endif
  2550. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2551. #if !defined(PIC)
  2552. ,"%"REG_b
  2553. #endif
  2554. );
  2555. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
  2556. {
  2557. //printf("%d %d %d\n", dstWidth, i, srcW);
  2558. dst[i] = src1[srcW-1]*128;
  2559. dst[i+VOFW] = src2[srcW-1]*128;
  2560. }
  2561. }
  2562. else
  2563. {
  2564. #endif /* HAVE_MMX2 */
  2565. x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
  2566. uint16_t xInc_mask = xInc & 0xffff;
  2567. __asm__ volatile(
  2568. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2569. "xor %%"REG_d", %%"REG_d" \n\t" // xx
  2570. "xorl %%ecx, %%ecx \n\t" // 2*xalpha
  2571. ASMALIGN(4)
  2572. "1: \n\t"
  2573. "mov %0, %%"REG_S" \n\t"
  2574. "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
  2575. "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
  2576. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  2577. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  2578. "shll $16, %%edi \n\t"
  2579. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  2580. "mov %1, %%"REG_D" \n\t"
  2581. "shrl $9, %%esi \n\t"
  2582. "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
  2583. "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
  2584. "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2585. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  2586. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  2587. "shll $16, %%edi \n\t"
  2588. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  2589. "mov %1, %%"REG_D" \n\t"
  2590. "shrl $9, %%esi \n\t"
  2591. "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
  2592. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  2593. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
  2594. "add $1, %%"REG_a" \n\t"
  2595. "cmp %2, %%"REG_a" \n\t"
  2596. " jb 1b \n\t"
2597. /* GCC 3.3 makes MPlayer crash on IA-32 machines when the "g" operand is
2598. used here, but that operand is needed to support GCC 4.0, hence the #if below. */
  2599. #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
  2600. :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
  2601. #else
  2602. :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
  2603. #endif
  2604. "r" (src2)
  2605. : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
  2606. );
  2607. #if HAVE_MMX2
  2608. } //if MMX2 can't be used
  2609. #endif
  2610. #else
  2611. int i;
  2612. unsigned int xpos=0;
  2613. for (i=0;i<dstWidth;i++)
  2614. {
  2615. register unsigned int xx=xpos>>16;
  2616. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  2617. dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
  2618. dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
  2619. /* slower
  2620. dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
  2621. dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
  2622. */
  2623. xpos+=xInc;
  2624. }
  2625. #endif /* ARCH_X86 */
  2626. }
  2627. if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
  2628. int i;
2629. //FIXME all pal and rgb srcFormats could do this conversion as well
  2630. //FIXME all scalers more complex than bilinear could do half of this transform
  2631. if(c->srcRange){
  2632. for (i=0; i<dstWidth; i++){
  2633. dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
  2634. dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
  2635. }
  2636. }else{
  2637. for (i=0; i<dstWidth; i++){
  2638. dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
  2639. dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
  2640. }
  2641. }
  2642. }
  2643. }
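/* Same idea as the luma case in hyscale(): the chroma range conversion
 * above rescales the 128x-scaled values around the midpoint 128*128,
 * roughly (c - 16384)*224/255 + 16384 going to limited range and
 * (c - 16384)*255/224 + 16384 going back to full range, with FFMIN again
 * clamping the result to the int16_t range (reader's interpretation of
 * the constants). */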
  2644. static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  2645. int srcSliceH, uint8_t* dst[], int dstStride[]){
2646. /* load a few things into local vars to make the code more readable and faster */
  2647. const int srcW= c->srcW;
  2648. const int dstW= c->dstW;
  2649. const int dstH= c->dstH;
  2650. const int chrDstW= c->chrDstW;
  2651. const int chrSrcW= c->chrSrcW;
  2652. const int lumXInc= c->lumXInc;
  2653. const int chrXInc= c->chrXInc;
  2654. const int dstFormat= c->dstFormat;
  2655. const int srcFormat= c->srcFormat;
  2656. const int flags= c->flags;
  2657. const int canMMX2BeUsed= c->canMMX2BeUsed;
  2658. int16_t *vLumFilterPos= c->vLumFilterPos;
  2659. int16_t *vChrFilterPos= c->vChrFilterPos;
  2660. int16_t *hLumFilterPos= c->hLumFilterPos;
  2661. int16_t *hChrFilterPos= c->hChrFilterPos;
  2662. int16_t *vLumFilter= c->vLumFilter;
  2663. int16_t *vChrFilter= c->vChrFilter;
  2664. int16_t *hLumFilter= c->hLumFilter;
  2665. int16_t *hChrFilter= c->hChrFilter;
  2666. int32_t *lumMmxFilter= c->lumMmxFilter;
  2667. int32_t *chrMmxFilter= c->chrMmxFilter;
  2668. int32_t *alpMmxFilter= c->alpMmxFilter;
  2669. const int vLumFilterSize= c->vLumFilterSize;
  2670. const int vChrFilterSize= c->vChrFilterSize;
  2671. const int hLumFilterSize= c->hLumFilterSize;
  2672. const int hChrFilterSize= c->hChrFilterSize;
  2673. int16_t **lumPixBuf= c->lumPixBuf;
  2674. int16_t **chrPixBuf= c->chrPixBuf;
  2675. int16_t **alpPixBuf= c->alpPixBuf;
  2676. const int vLumBufSize= c->vLumBufSize;
  2677. const int vChrBufSize= c->vChrBufSize;
  2678. uint8_t *funnyYCode= c->funnyYCode;
  2679. uint8_t *funnyUVCode= c->funnyUVCode;
  2680. uint8_t *formatConvBuffer= c->formatConvBuffer;
  2681. const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
  2682. const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
  2683. int lastDstY;
  2684. uint32_t *pal=c->pal_yuv;
  2685. /* vars which will change and which we need to store back in the context */
  2686. int dstY= c->dstY;
  2687. int lumBufIndex= c->lumBufIndex;
  2688. int chrBufIndex= c->chrBufIndex;
  2689. int lastInLumBuf= c->lastInLumBuf;
  2690. int lastInChrBuf= c->lastInChrBuf;
  2691. if (isPacked(c->srcFormat)){
  2692. src[0]=
  2693. src[1]=
  2694. src[2]=
  2695. src[3]= src[0];
  2696. srcStride[0]=
  2697. srcStride[1]=
  2698. srcStride[2]=
  2699. srcStride[3]= srcStride[0];
  2700. }
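/* For packed input formats all four plane pointers and strides were forced
 * to the same values just above, so the loops further down can hand
 * src[1]/src[2]/src[3] to the packed-to-planar helpers without
 * special-casing packed layouts. */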
  2701. srcStride[1]<<= c->vChrDrop;
  2702. srcStride[2]<<= c->vChrDrop;
  2703. //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
  2704. // (int)dst[0], (int)dst[1], (int)dst[2]);
  2705. #if 0 //self test FIXME move to a vfilter or something
  2706. {
  2707. static volatile int i=0;
  2708. i++;
  2709. if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
  2710. selfTest(src, srcStride, c->srcW, c->srcH);
  2711. i--;
  2712. }
  2713. #endif
  2714. //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
  2715. //dstStride[0],dstStride[1],dstStride[2]);
  2716. if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0)
  2717. {
  2718. static int warnedAlready=0; //FIXME move this into the context perhaps
  2719. if (flags & SWS_PRINT_INFO && !warnedAlready)
  2720. {
  2721. av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
  2722. " ->cannot do aligned memory accesses anymore\n");
  2723. warnedAlready=1;
  2724. }
  2725. }
2726. /* Note: the user might start scaling in the middle of the picture, so this
2727. will not get executed. This is not really intended but works
2728. currently, so people might do it. */
  2729. if (srcSliceY ==0){
  2730. lumBufIndex=0;
  2731. chrBufIndex=0;
  2732. dstY=0;
  2733. lastInLumBuf= -1;
  2734. lastInChrBuf= -1;
  2735. }
  2736. lastDstY= dstY;
  2737. for (;dstY < dstH; dstY++){
  2738. unsigned char *dest =dst[0]+dstStride[0]*dstY;
  2739. const int chrDstY= dstY>>c->chrDstVSubSample;
  2740. unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
  2741. unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
  2742. unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
  2743. const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
  2744. const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
  2745. const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
  2746. const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
  2747. //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
  2748. // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
  2749. //handle holes (FAST_BILINEAR & weird filters)
  2750. if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
  2751. if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
  2752. //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
  2753. assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
  2754. assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2755. // Do we have enough lines in this slice to output the dstY line?
  2756. if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
  2757. {
  2758. //Do horizontal scaling
  2759. while(lastInLumBuf < lastLumSrcY)
  2760. {
  2761. uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
  2762. uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
  2763. lumBufIndex++;
  2764. //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
  2765. assert(lumBufIndex < 2*vLumBufSize);
  2766. assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
  2767. assert(lastInLumBuf + 1 - srcSliceY >= 0);
  2768. //printf("%d %d\n", lumBufIndex, vLumBufSize);
  2769. RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
  2770. flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
  2771. funnyYCode, c->srcFormat, formatConvBuffer,
  2772. c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 0);
  2773. if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
  2774. RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
  2775. flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
  2776. funnyYCode, c->srcFormat, formatConvBuffer,
  2777. c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 1);
  2778. lastInLumBuf++;
  2779. }
  2780. while(lastInChrBuf < lastChrSrcY)
  2781. {
  2782. uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
  2783. uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
  2784. chrBufIndex++;
  2785. assert(chrBufIndex < 2*vChrBufSize);
  2786. assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
  2787. assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
  2788. //FIXME replace parameters through context struct (some at least)
  2789. if (!(isGray(srcFormat) || isGray(dstFormat)))
  2790. RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
  2791. flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
  2792. funnyUVCode, c->srcFormat, formatConvBuffer,
  2793. c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
  2794. lastInChrBuf++;
  2795. }
  2796. //wrap buf index around to stay inside the ring buffer
  2797. if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
  2798. if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
  2799. }
  2800. else // not enough lines left in this slice -> load the rest in the buffer
  2801. {
  2802. /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
  2803. firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
  2804. lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
  2805. vChrBufSize, vLumBufSize);*/
  2806. //Do horizontal scaling
  2807. while(lastInLumBuf+1 < srcSliceY + srcSliceH)
  2808. {
  2809. uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
  2810. uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
  2811. lumBufIndex++;
  2812. assert(lumBufIndex < 2*vLumBufSize);
  2813. assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
  2814. assert(lastInLumBuf + 1 - srcSliceY >= 0);
  2815. RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
  2816. flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
  2817. funnyYCode, c->srcFormat, formatConvBuffer,
  2818. c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 0);
  2819. if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
  2820. RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
  2821. flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
  2822. funnyYCode, c->srcFormat, formatConvBuffer,
  2823. c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 1);
  2824. lastInLumBuf++;
  2825. }
  2826. while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
  2827. {
  2828. uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
  2829. uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
  2830. chrBufIndex++;
  2831. assert(chrBufIndex < 2*vChrBufSize);
  2832. assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
  2833. assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
  2834. if (!(isGray(srcFormat) || isGray(dstFormat)))
  2835. RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
  2836. flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
  2837. funnyUVCode, c->srcFormat, formatConvBuffer,
  2838. c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
  2839. lastInChrBuf++;
  2840. }
  2841. //wrap buf index around to stay inside the ring buffer
  2842. if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
  2843. if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
  2844. break; //we can't output a dstY line so let's try with the next slice
  2845. }
  2846. #if HAVE_MMX
  2847. c->blueDither= ff_dither8[dstY&1];
  2848. if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
  2849. c->greenDither= ff_dither8[dstY&1];
  2850. else
  2851. c->greenDither= ff_dither4[dstY&1];
  2852. c->redDither= ff_dither8[(dstY+1)&1];
  2853. #endif
  2854. if (dstY < dstH-2)
  2855. {
  2856. int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  2857. int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
  2858. int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
  2859. #if HAVE_MMX
  2860. int i;
  2861. if (flags & SWS_ACCURATE_RND){
  2862. int s= APCK_SIZE / 8;
  2863. for (i=0; i<vLumFilterSize; i+=2){
  2864. *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
  2865. *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
  2866. lumMmxFilter[s*i+APCK_COEF/4 ]=
  2867. lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
  2868. + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
  2869. if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
  2870. *(void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
  2871. *(void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
  2872. alpMmxFilter[s*i+APCK_COEF/4 ]=
  2873. alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
  2874. }
  2875. }
  2876. for (i=0; i<vChrFilterSize; i+=2){
  2877. *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
  2878. *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
  2879. chrMmxFilter[s*i+APCK_COEF/4 ]=
  2880. chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
  2881. + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
  2882. }
  2883. }else{
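/* default layout: per tap, the line pointer split into low/high 32-bit halves, followed by the 16-bit coefficient replicated across a 64-bit word */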
  2884. for (i=0; i<vLumFilterSize; i++)
  2885. {
  2886. lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
  2887. lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
  2888. lumMmxFilter[4*i+2]=
  2889. lumMmxFilter[4*i+3]=
  2890. ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
  2891. if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
  2892. alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
  2893. alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
  2894. alpMmxFilter[4*i+2]=
  2895. alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
  2896. }
  2897. }
  2898. for (i=0; i<vChrFilterSize; i++)
  2899. {
  2900. chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
  2901. chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
  2902. chrMmxFilter[4*i+2]=
  2903. chrMmxFilter[4*i+3]=
  2904. ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
  2905. }
  2906. }
  2907. #endif
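/* vertical scaling and output: dispatch on the destination format (semi-planar NV12/NV21, planar YV12-like / gray, or packed output) */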
  2908. if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
  2909. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2910. if (dstY&chrSkipMask) uDest= NULL; //FIXME split this into separate luma / chroma functions
  2911. RENAME(yuv2nv12X)(c,
  2912. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2913. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2914. dest, uDest, dstW, chrDstW, dstFormat);
  2915. }
  2916. else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
  2917. {
  2918. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2919. if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split this into separate luma / chroma functions
  2920. if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
  2921. {
  2922. int16_t *lumBuf = lumPixBuf[0];
  2923. int16_t *chrBuf= chrPixBuf[0];
  2924. int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf[0] : NULL;
  2925. RENAME(yuv2yuv1)(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
  2926. }
  2927. else //General YV12
  2928. {
  2929. RENAME(yuv2yuvX)(c,
  2930. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2931. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2932. alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
  2933. }
  2934. }
  2935. else
  2936. {
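/* packed output: pick a specialized path depending on the vertical filter sizes */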
  2937. assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
  2938. assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
  2939. if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
  2940. {
  2941. int chrAlpha= vChrFilter[2*dstY+1];
  2942. if(flags & SWS_FULL_CHR_H_INT){
  2943. yuv2rgbXinC_full(c, //FIXME write a packed1_full function
  2944. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2945. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2946. alpSrcPtr, dest, dstW, dstY);
  2947. }else{
  2948. RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
  2949. alpPixBuf ? *alpSrcPtr : NULL,
  2950. dest, dstW, chrAlpha, dstFormat, flags, dstY);
  2951. }
  2952. }
  2953. else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
  2954. {
  2955. int lumAlpha= vLumFilter[2*dstY+1];
  2956. int chrAlpha= vChrFilter[2*dstY+1];
  2957. lumMmxFilter[2]=
  2958. lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
  2959. chrMmxFilter[2]=
  2960. chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
  2961. if(flags & SWS_FULL_CHR_H_INT){
  2962. yuv2rgbXinC_full(c, //FIXME write a packed2_full function
  2963. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2964. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2965. alpSrcPtr, dest, dstW, dstY);
  2966. }else{
  2967. RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
  2968. alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
  2969. dest, dstW, lumAlpha, chrAlpha, dstY);
  2970. }
  2971. }
  2972. else //general RGB
  2973. {
  2974. if(flags & SWS_FULL_CHR_H_INT){
  2975. yuv2rgbXinC_full(c,
  2976. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2977. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2978. alpSrcPtr, dest, dstW, dstY);
  2979. }else{
  2980. RENAME(yuv2packedX)(c,
  2981. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2982. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2983. alpSrcPtr, dest, dstW, dstY);
  2984. }
  2985. }
  2986. }
  2987. }
2988. else // the MMX output code would overwrite this array's tail on the last lines, so fall back to the C paths
  2989. {
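/* same dispatch as above, but using the plain C output functions */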
  2990. int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  2991. int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
  2992. int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
  2993. if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
  2994. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2995. if (dstY&chrSkipMask) uDest= NULL; //FIXME split this into separate luma / chroma functions
  2996. yuv2nv12XinC(
  2997. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2998. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2999. dest, uDest, dstW, chrDstW, dstFormat);
  3000. }
  3001. else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
  3002. {
  3003. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3004. if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split this into separate luma / chroma functions
  3005. yuv2yuvXinC(
  3006. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  3007. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  3008. alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
  3009. }
  3010. else
  3011. {
  3012. assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
  3013. assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
  3014. if(flags & SWS_FULL_CHR_H_INT){
  3015. yuv2rgbXinC_full(c,
  3016. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  3017. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  3018. alpSrcPtr, dest, dstW, dstY);
  3019. }else{
  3020. yuv2packedXinC(c,
  3021. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  3022. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  3023. alpSrcPtr, dest, dstW, dstY);
  3024. }
  3025. }
  3026. }
  3027. }
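/* if the destination carries an alpha plane but no source alpha was scaled, fill the rows written in this call with opaque (255) */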
  3028. if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
  3029. fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
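/* flush the non-temporal (MOVNTQ) stores and clear the MMX register state before returning to C code */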
  3030. #if HAVE_MMX
  3031. __asm__ volatile(SFENCE:::"memory");
  3032. __asm__ volatile(EMMS:::"memory");
  3033. #endif
  3034. /* store changed local vars back in the context */
  3035. c->dstY= dstY;
  3036. c->lumBufIndex= lumBufIndex;
  3037. c->chrBufIndex= chrBufIndex;
  3038. c->lastInLumBuf= lastInLumBuf;
  3039. c->lastInChrBuf= lastInChrBuf;
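/* number of destination lines written in this call */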
  3040. return dstY - lastDstY;
  3041. }