  1. /*
  2. * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. *
  20. * The C code (not assembly, MMX, ...) of this file can be used
  21. * under the LGPL license.
  22. */
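/* Per-CPU instruction selection: depending on 3DNow!/MMX2 availability the
   macros below pick the matching prefetch instructions, the packed byte
   average (pavgb/pavgusb) and a non-temporal store for MOVNTQ, falling back
   to plain movq or no-ops on bare MMX. */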
  23. #undef REAL_MOVNTQ
  24. #undef MOVNTQ
  25. #undef PAVGB
  26. #undef PREFETCH
  27. #undef PREFETCHW
  28. #if HAVE_AMD3DNOW
  29. #define PREFETCH "prefetch"
  30. #define PREFETCHW "prefetchw"
  31. #elif HAVE_MMX2
  32. #define PREFETCH "prefetchnta"
  33. #define PREFETCHW "prefetcht0"
  34. #else
  35. #define PREFETCH " # nop"
  36. #define PREFETCHW " # nop"
  37. #endif
  38. #if HAVE_MMX2
  39. #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
  40. #elif HAVE_AMD3DNOW
  41. #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
  42. #endif
  43. #if HAVE_MMX2
  44. #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
  45. #else
  46. #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
  47. #endif
  48. #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
  49. #if HAVE_ALTIVEC
  50. #include "ppc/swscale_altivec_template.c"
  51. #endif
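/* YSCALEYUV2YV12X: vertical multi-tap scaler for one plane. For every group
   of 8 output pixels it walks the list of {source line pointer, coefficient}
   entries stored at 'offset' in the context (terminated by a NULL pointer),
   accumulates the pmulhw products on top of the rounding constant, shifts the
   result down, packs it to unsigned bytes and stores it with MOVNTQ. */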
  52. #define YSCALEYUV2YV12X(x, offset, dest, width) \
  53. __asm__ volatile(\
  54. "xor %%"REG_a", %%"REG_a" \n\t"\
  55. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  56. "movq %%mm3, %%mm4 \n\t"\
  57. "lea " offset "(%0), %%"REG_d" \n\t"\
  58. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  59. ASMALIGN(4) /* FIXME Unroll? */\
  60. "1: \n\t"\
  61. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  62. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
  63. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
  64. "add $16, %%"REG_d" \n\t"\
  65. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  66. "test %%"REG_S", %%"REG_S" \n\t"\
  67. "pmulhw %%mm0, %%mm2 \n\t"\
  68. "pmulhw %%mm0, %%mm5 \n\t"\
  69. "paddw %%mm2, %%mm3 \n\t"\
  70. "paddw %%mm5, %%mm4 \n\t"\
  71. " jnz 1b \n\t"\
  72. "psraw $3, %%mm3 \n\t"\
  73. "psraw $3, %%mm4 \n\t"\
  74. "packuswb %%mm4, %%mm3 \n\t"\
  75. MOVNTQ(%%mm3, (%1, %%REGa))\
  76. "add $8, %%"REG_a" \n\t"\
  77. "cmp %2, %%"REG_a" \n\t"\
  78. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  79. "movq %%mm3, %%mm4 \n\t"\
  80. "lea " offset "(%0), %%"REG_d" \n\t"\
  81. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  82. "jb 1b \n\t"\
  83. :: "r" (&c->redDither),\
  84. "r" (dest), "g" (width)\
  85. : "%"REG_a, "%"REG_d, "%"REG_S\
  86. );
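/* Higher-precision variant: samples from two consecutive filter lines are
   interleaved and multiplied with pmaddwd (32-bit accumulation; the APCK_*
   offsets describe the packed pointer/coefficient layout), so less precision
   is lost than in the pmulhw loop above. */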
  87. #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
  88. __asm__ volatile(\
  89. "lea " offset "(%0), %%"REG_d" \n\t"\
  90. "xor %%"REG_a", %%"REG_a" \n\t"\
  91. "pxor %%mm4, %%mm4 \n\t"\
  92. "pxor %%mm5, %%mm5 \n\t"\
  93. "pxor %%mm6, %%mm6 \n\t"\
  94. "pxor %%mm7, %%mm7 \n\t"\
  95. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  96. ASMALIGN(4) \
  97. "1: \n\t"\
  98. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
  99. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
  100. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  101. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
  102. "movq %%mm0, %%mm3 \n\t"\
  103. "punpcklwd %%mm1, %%mm0 \n\t"\
  104. "punpckhwd %%mm1, %%mm3 \n\t"\
  105. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
  106. "pmaddwd %%mm1, %%mm0 \n\t"\
  107. "pmaddwd %%mm1, %%mm3 \n\t"\
  108. "paddd %%mm0, %%mm4 \n\t"\
  109. "paddd %%mm3, %%mm5 \n\t"\
  110. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
  111. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  112. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  113. "test %%"REG_S", %%"REG_S" \n\t"\
  114. "movq %%mm2, %%mm0 \n\t"\
  115. "punpcklwd %%mm3, %%mm2 \n\t"\
  116. "punpckhwd %%mm3, %%mm0 \n\t"\
  117. "pmaddwd %%mm1, %%mm2 \n\t"\
  118. "pmaddwd %%mm1, %%mm0 \n\t"\
  119. "paddd %%mm2, %%mm6 \n\t"\
  120. "paddd %%mm0, %%mm7 \n\t"\
  121. " jnz 1b \n\t"\
  122. "psrad $16, %%mm4 \n\t"\
  123. "psrad $16, %%mm5 \n\t"\
  124. "psrad $16, %%mm6 \n\t"\
  125. "psrad $16, %%mm7 \n\t"\
  126. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  127. "packssdw %%mm5, %%mm4 \n\t"\
  128. "packssdw %%mm7, %%mm6 \n\t"\
  129. "paddw %%mm0, %%mm4 \n\t"\
  130. "paddw %%mm0, %%mm6 \n\t"\
  131. "psraw $3, %%mm4 \n\t"\
  132. "psraw $3, %%mm6 \n\t"\
  133. "packuswb %%mm6, %%mm4 \n\t"\
  134. MOVNTQ(%%mm4, (%1, %%REGa))\
  135. "add $8, %%"REG_a" \n\t"\
  136. "cmp %2, %%"REG_a" \n\t"\
  137. "lea " offset "(%0), %%"REG_d" \n\t"\
  138. "pxor %%mm4, %%mm4 \n\t"\
  139. "pxor %%mm5, %%mm5 \n\t"\
  140. "pxor %%mm6, %%mm6 \n\t"\
  141. "pxor %%mm7, %%mm7 \n\t"\
  142. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  143. "jb 1b \n\t"\
  144. :: "r" (&c->redDither),\
  145. "r" (dest), "g" (width)\
  146. : "%"REG_a, "%"REG_d, "%"REG_S\
  147. );
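/* Unscaled (single input line) path: the 16-bit intermediates are simply
   shifted down by 7 and packed to bytes; the _ACCURATE variant below first
   adds a bias of 64 so the >>7 rounds to nearest instead of truncating. */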
  148. #define YSCALEYUV2YV121 \
  149. "mov %2, %%"REG_a" \n\t"\
  150. ASMALIGN(4) /* FIXME Unroll? */\
  151. "1: \n\t"\
  152. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
  153. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
  154. "psraw $7, %%mm0 \n\t"\
  155. "psraw $7, %%mm1 \n\t"\
  156. "packuswb %%mm1, %%mm0 \n\t"\
  157. MOVNTQ(%%mm0, (%1, %%REGa))\
  158. "add $8, %%"REG_a" \n\t"\
  159. "jnc 1b \n\t"
  160. #define YSCALEYUV2YV121_ACCURATE \
  161. "mov %2, %%"REG_a" \n\t"\
  162. "pcmpeqw %%mm7, %%mm7 \n\t"\
  163. "psrlw $15, %%mm7 \n\t"\
  164. "psllw $6, %%mm7 \n\t"\
  165. ASMALIGN(4) /* FIXME Unroll? */\
  166. "1: \n\t"\
  167. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
  168. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
  169. "paddsw %%mm7, %%mm0 \n\t"\
  170. "paddsw %%mm7, %%mm1 \n\t"\
  171. "psraw $7, %%mm0 \n\t"\
  172. "psraw $7, %%mm1 \n\t"\
  173. "packuswb %%mm1, %%mm0 \n\t"\
  174. MOVNTQ(%%mm0, (%1, %%REGa))\
  175. "add $8, %%"REG_a" \n\t"\
  176. "jnc 1b \n\t"
  177. /*
  178. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  179. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  180. "r" (dest), "m" (dstW),
  181. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  182. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  183. */
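/* The YSCALEYUV2PACKEDX* macros perform the same vertical filtering as above,
   but keep luma and chroma in registers (Y1/Y2 in mm1/mm7, U/V in mm3/mm4) so
   that one of the WRITE* macros can convert them directly to a packed pixel
   format. */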
  184. #define YSCALEYUV2PACKEDX_UV \
  185. __asm__ volatile(\
  186. "xor %%"REG_a", %%"REG_a" \n\t"\
  187. ASMALIGN(4)\
  188. "nop \n\t"\
  189. "1: \n\t"\
  190. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  191. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  192. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  193. "movq %%mm3, %%mm4 \n\t"\
  194. ASMALIGN(4)\
  195. "2: \n\t"\
  196. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  197. "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
  198. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
  199. "add $16, %%"REG_d" \n\t"\
  200. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  201. "pmulhw %%mm0, %%mm2 \n\t"\
  202. "pmulhw %%mm0, %%mm5 \n\t"\
  203. "paddw %%mm2, %%mm3 \n\t"\
  204. "paddw %%mm5, %%mm4 \n\t"\
  205. "test %%"REG_S", %%"REG_S" \n\t"\
  206. " jnz 2b \n\t"\
  207. #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
  208. "lea "offset"(%0), %%"REG_d" \n\t"\
  209. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  210. "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
  211. "movq "#dst1", "#dst2" \n\t"\
  212. ASMALIGN(4)\
  213. "2: \n\t"\
  214. "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
  215. "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
  216. "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
  217. "add $16, %%"REG_d" \n\t"\
  218. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  219. "pmulhw "#coeff", "#src1" \n\t"\
  220. "pmulhw "#coeff", "#src2" \n\t"\
  221. "paddw "#src1", "#dst1" \n\t"\
  222. "paddw "#src2", "#dst2" \n\t"\
  223. "test %%"REG_S", %%"REG_S" \n\t"\
  224. " jnz 2b \n\t"\
  225. #define YSCALEYUV2PACKEDX \
  226. YSCALEYUV2PACKEDX_UV \
  227. YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
  228. #define YSCALEYUV2PACKEDX_END \
  229. :: "r" (&c->redDither), \
  230. "m" (dummy), "m" (dummy), "m" (dummy),\
  231. "r" (dest), "m" (dstW) \
  232. : "%"REG_a, "%"REG_d, "%"REG_S \
  233. );
  234. #define YSCALEYUV2PACKEDX_ACCURATE_UV \
  235. __asm__ volatile(\
  236. "xor %%"REG_a", %%"REG_a" \n\t"\
  237. ASMALIGN(4)\
  238. "nop \n\t"\
  239. "1: \n\t"\
  240. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  241. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  242. "pxor %%mm4, %%mm4 \n\t"\
  243. "pxor %%mm5, %%mm5 \n\t"\
  244. "pxor %%mm6, %%mm6 \n\t"\
  245. "pxor %%mm7, %%mm7 \n\t"\
  246. ASMALIGN(4)\
  247. "2: \n\t"\
  248. "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
  249. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
  250. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  251. "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
  252. "movq %%mm0, %%mm3 \n\t"\
  253. "punpcklwd %%mm1, %%mm0 \n\t"\
  254. "punpckhwd %%mm1, %%mm3 \n\t"\
  255. "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
  256. "pmaddwd %%mm1, %%mm0 \n\t"\
  257. "pmaddwd %%mm1, %%mm3 \n\t"\
  258. "paddd %%mm0, %%mm4 \n\t"\
  259. "paddd %%mm3, %%mm5 \n\t"\
  260. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
  261. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  262. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  263. "test %%"REG_S", %%"REG_S" \n\t"\
  264. "movq %%mm2, %%mm0 \n\t"\
  265. "punpcklwd %%mm3, %%mm2 \n\t"\
  266. "punpckhwd %%mm3, %%mm0 \n\t"\
  267. "pmaddwd %%mm1, %%mm2 \n\t"\
  268. "pmaddwd %%mm1, %%mm0 \n\t"\
  269. "paddd %%mm2, %%mm6 \n\t"\
  270. "paddd %%mm0, %%mm7 \n\t"\
  271. " jnz 2b \n\t"\
  272. "psrad $16, %%mm4 \n\t"\
  273. "psrad $16, %%mm5 \n\t"\
  274. "psrad $16, %%mm6 \n\t"\
  275. "psrad $16, %%mm7 \n\t"\
  276. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  277. "packssdw %%mm5, %%mm4 \n\t"\
  278. "packssdw %%mm7, %%mm6 \n\t"\
  279. "paddw %%mm0, %%mm4 \n\t"\
  280. "paddw %%mm0, %%mm6 \n\t"\
  281. "movq %%mm4, "U_TEMP"(%0) \n\t"\
  282. "movq %%mm6, "V_TEMP"(%0) \n\t"\
  283. #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
  284. "lea "offset"(%0), %%"REG_d" \n\t"\
  285. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  286. "pxor %%mm1, %%mm1 \n\t"\
  287. "pxor %%mm5, %%mm5 \n\t"\
  288. "pxor %%mm7, %%mm7 \n\t"\
  289. "pxor %%mm6, %%mm6 \n\t"\
  290. ASMALIGN(4)\
  291. "2: \n\t"\
  292. "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
  293. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
  294. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  295. "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
  296. "movq %%mm0, %%mm3 \n\t"\
  297. "punpcklwd %%mm4, %%mm0 \n\t"\
  298. "punpckhwd %%mm4, %%mm3 \n\t"\
  299. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
  300. "pmaddwd %%mm4, %%mm0 \n\t"\
  301. "pmaddwd %%mm4, %%mm3 \n\t"\
  302. "paddd %%mm0, %%mm1 \n\t"\
  303. "paddd %%mm3, %%mm5 \n\t"\
  304. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
  305. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  306. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  307. "test %%"REG_S", %%"REG_S" \n\t"\
  308. "movq %%mm2, %%mm0 \n\t"\
  309. "punpcklwd %%mm3, %%mm2 \n\t"\
  310. "punpckhwd %%mm3, %%mm0 \n\t"\
  311. "pmaddwd %%mm4, %%mm2 \n\t"\
  312. "pmaddwd %%mm4, %%mm0 \n\t"\
  313. "paddd %%mm2, %%mm7 \n\t"\
  314. "paddd %%mm0, %%mm6 \n\t"\
  315. " jnz 2b \n\t"\
  316. "psrad $16, %%mm1 \n\t"\
  317. "psrad $16, %%mm5 \n\t"\
  318. "psrad $16, %%mm7 \n\t"\
  319. "psrad $16, %%mm6 \n\t"\
  320. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  321. "packssdw %%mm5, %%mm1 \n\t"\
  322. "packssdw %%mm6, %%mm7 \n\t"\
  323. "paddw %%mm0, %%mm1 \n\t"\
  324. "paddw %%mm0, %%mm7 \n\t"\
  325. "movq "U_TEMP"(%0), %%mm3 \n\t"\
  326. "movq "V_TEMP"(%0), %%mm4 \n\t"\
  327. #define YSCALEYUV2PACKEDX_ACCURATE \
  328. YSCALEYUV2PACKEDX_ACCURATE_UV \
  329. YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
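/* Convert the vertically filtered values (mm1/mm7 = Y1/Y2, mm3/mm4 = U/V) to
   RGB using the per-context offset and coefficient tables; the results end up
   byte-packed as mm2=B, mm4=G, mm5=R. */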
  330. #define YSCALEYUV2RGBX \
  331. "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
  332. "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
  333. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  334. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  335. "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
  336. "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
  337. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  338. "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
  339. "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
  340. "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
  341. "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
  342. "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
  343. "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
  344. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  345. "paddw %%mm3, %%mm4 \n\t"\
  346. "movq %%mm2, %%mm0 \n\t"\
  347. "movq %%mm5, %%mm6 \n\t"\
  348. "movq %%mm4, %%mm3 \n\t"\
  349. "punpcklwd %%mm2, %%mm2 \n\t"\
  350. "punpcklwd %%mm5, %%mm5 \n\t"\
  351. "punpcklwd %%mm4, %%mm4 \n\t"\
  352. "paddw %%mm1, %%mm2 \n\t"\
  353. "paddw %%mm1, %%mm5 \n\t"\
  354. "paddw %%mm1, %%mm4 \n\t"\
  355. "punpckhwd %%mm0, %%mm0 \n\t"\
  356. "punpckhwd %%mm6, %%mm6 \n\t"\
  357. "punpckhwd %%mm3, %%mm3 \n\t"\
  358. "paddw %%mm7, %%mm0 \n\t"\
  359. "paddw %%mm7, %%mm6 \n\t"\
  360. "paddw %%mm7, %%mm3 \n\t"\
  361. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  362. "packuswb %%mm0, %%mm2 \n\t"\
  363. "packuswb %%mm6, %%mm5 \n\t"\
  364. "packuswb %%mm3, %%mm4 \n\t"\
  365. #define REAL_YSCALEYUV2PACKED(index, c) \
  366. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  367. "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
  368. "psraw $3, %%mm0 \n\t"\
  369. "psraw $3, %%mm1 \n\t"\
  370. "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  371. "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  372. "xor "#index", "#index" \n\t"\
  373. ASMALIGN(4)\
  374. "1: \n\t"\
  375. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  376. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  377. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  378. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  379. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  380. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  381. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  382. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  383. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  384. "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  385. "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  386. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  387. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  388. "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  389. "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  390. "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  391. "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  392. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  393. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  394. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  395. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  396. "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  397. "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  398. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  399. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  400. #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
  401. #define REAL_YSCALEYUV2RGB_UV(index, c) \
  402. "xor "#index", "#index" \n\t"\
  403. ASMALIGN(4)\
  404. "1: \n\t"\
  405. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  406. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  407. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  408. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  409. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  410. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  411. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  412. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  413. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  414. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  415. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  416. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  417. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  418. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  419. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  420. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  421. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  422. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  423. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  424. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  425. #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
  426. "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  427. "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  428. "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  429. "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  430. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  431. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  432. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  433. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  434. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  435. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  436. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  437. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  438. #define REAL_YSCALEYUV2RGB_COEFF(c) \
  439. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  440. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  441. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  442. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  443. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  444. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  445. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  446. "paddw %%mm3, %%mm4 \n\t"\
  447. "movq %%mm2, %%mm0 \n\t"\
  448. "movq %%mm5, %%mm6 \n\t"\
  449. "movq %%mm4, %%mm3 \n\t"\
  450. "punpcklwd %%mm2, %%mm2 \n\t"\
  451. "punpcklwd %%mm5, %%mm5 \n\t"\
  452. "punpcklwd %%mm4, %%mm4 \n\t"\
  453. "paddw %%mm1, %%mm2 \n\t"\
  454. "paddw %%mm1, %%mm5 \n\t"\
  455. "paddw %%mm1, %%mm4 \n\t"\
  456. "punpckhwd %%mm0, %%mm0 \n\t"\
  457. "punpckhwd %%mm6, %%mm6 \n\t"\
  458. "punpckhwd %%mm3, %%mm3 \n\t"\
  459. "paddw %%mm7, %%mm0 \n\t"\
  460. "paddw %%mm7, %%mm6 \n\t"\
  461. "paddw %%mm7, %%mm3 \n\t"\
  462. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  463. "packuswb %%mm0, %%mm2 \n\t"\
  464. "packuswb %%mm6, %%mm5 \n\t"\
  465. "packuswb %%mm3, %%mm4 \n\t"\
  466. #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
  467. #define YSCALEYUV2RGB(index, c) \
  468. REAL_YSCALEYUV2RGB_UV(index, c) \
  469. REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
  470. REAL_YSCALEYUV2RGB_COEFF(c)
  471. #define REAL_YSCALEYUV2PACKED1(index, c) \
  472. "xor "#index", "#index" \n\t"\
  473. ASMALIGN(4)\
  474. "1: \n\t"\
  475. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  476. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  477. "psraw $7, %%mm3 \n\t" \
  478. "psraw $7, %%mm4 \n\t" \
  479. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  480. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  481. "psraw $7, %%mm1 \n\t" \
  482. "psraw $7, %%mm7 \n\t" \
  483. #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
  484. #define REAL_YSCALEYUV2RGB1(index, c) \
  485. "xor "#index", "#index" \n\t"\
  486. ASMALIGN(4)\
  487. "1: \n\t"\
  488. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  489. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  490. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  491. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  492. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  493. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  494. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  495. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  496. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  497. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  498. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  499. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  500. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  501. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  502. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  503. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  504. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  505. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  506. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  507. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  508. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  509. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  510. "paddw %%mm3, %%mm4 \n\t"\
  511. "movq %%mm2, %%mm0 \n\t"\
  512. "movq %%mm5, %%mm6 \n\t"\
  513. "movq %%mm4, %%mm3 \n\t"\
  514. "punpcklwd %%mm2, %%mm2 \n\t"\
  515. "punpcklwd %%mm5, %%mm5 \n\t"\
  516. "punpcklwd %%mm4, %%mm4 \n\t"\
  517. "paddw %%mm1, %%mm2 \n\t"\
  518. "paddw %%mm1, %%mm5 \n\t"\
  519. "paddw %%mm1, %%mm4 \n\t"\
  520. "punpckhwd %%mm0, %%mm0 \n\t"\
  521. "punpckhwd %%mm6, %%mm6 \n\t"\
  522. "punpckhwd %%mm3, %%mm3 \n\t"\
  523. "paddw %%mm7, %%mm0 \n\t"\
  524. "paddw %%mm7, %%mm6 \n\t"\
  525. "paddw %%mm7, %%mm3 \n\t"\
  526. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  527. "packuswb %%mm0, %%mm2 \n\t"\
  528. "packuswb %%mm6, %%mm5 \n\t"\
  529. "packuswb %%mm3, %%mm4 \n\t"\
  530. #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
  531. #define REAL_YSCALEYUV2PACKED1b(index, c) \
  532. "xor "#index", "#index" \n\t"\
  533. ASMALIGN(4)\
  534. "1: \n\t"\
  535. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  536. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  537. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  538. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  539. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  540. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  541. "psrlw $8, %%mm3 \n\t" \
  542. "psrlw $8, %%mm4 \n\t" \
  543. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  544. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  545. "psraw $7, %%mm1 \n\t" \
  546. "psraw $7, %%mm7 \n\t"
  547. #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
  548. // do vertical chrominance interpolation
  549. #define REAL_YSCALEYUV2RGB1b(index, c) \
  550. "xor "#index", "#index" \n\t"\
  551. ASMALIGN(4)\
  552. "1: \n\t"\
  553. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  554. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  555. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  556. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  557. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  558. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  559. "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
  560. "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
  561. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  562. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  563. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  564. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  565. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  566. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  567. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  568. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  569. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  570. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  571. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  572. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  573. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  574. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  575. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  576. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  577. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  578. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  579. "paddw %%mm3, %%mm4 \n\t"\
  580. "movq %%mm2, %%mm0 \n\t"\
  581. "movq %%mm5, %%mm6 \n\t"\
  582. "movq %%mm4, %%mm3 \n\t"\
  583. "punpcklwd %%mm2, %%mm2 \n\t"\
  584. "punpcklwd %%mm5, %%mm5 \n\t"\
  585. "punpcklwd %%mm4, %%mm4 \n\t"\
  586. "paddw %%mm1, %%mm2 \n\t"\
  587. "paddw %%mm1, %%mm5 \n\t"\
  588. "paddw %%mm1, %%mm4 \n\t"\
  589. "punpckhwd %%mm0, %%mm0 \n\t"\
  590. "punpckhwd %%mm6, %%mm6 \n\t"\
  591. "punpckhwd %%mm3, %%mm3 \n\t"\
  592. "paddw %%mm7, %%mm0 \n\t"\
  593. "paddw %%mm7, %%mm6 \n\t"\
  594. "paddw %%mm7, %%mm3 \n\t"\
  595. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  596. "packuswb %%mm0, %%mm2 \n\t"\
  597. "packuswb %%mm6, %%mm5 \n\t"\
  598. "packuswb %%mm3, %%mm4 \n\t"\
  599. #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
  600. #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
  601. "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
  602. "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
  603. "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
  604. "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
  605. "packuswb %%mm1, %%mm7 \n\t"
  606. #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
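/* The WRITE* macros store the byte-packed components produced above: they
   interleave B/G/R (plus alpha or a zero register) into the destination pixel
   layout, write 8 pixels per iteration with MOVNTQ, advance the index and
   loop back to label 1 until dstw is reached. */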
  607. #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
  608. "movq "#b", "#q2" \n\t" /* B */\
  609. "movq "#r", "#t" \n\t" /* R */\
  610. "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
  611. "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
  612. "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
  613. "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
  614. "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
  615. "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
  616. "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
  617. "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
  618. "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
  619. "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
  620. \
  621. MOVNTQ( q0, (dst, index, 4))\
  622. MOVNTQ( b, 8(dst, index, 4))\
  623. MOVNTQ( q2, 16(dst, index, 4))\
  624. MOVNTQ( q3, 24(dst, index, 4))\
  625. \
  626. "add $8, "#index" \n\t"\
  627. "cmp "#dstw", "#index" \n\t"\
  628. " jb 1b \n\t"
  629. #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
  630. #define REAL_WRITERGB16(dst, dstw, index) \
  631. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  632. "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
  633. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  634. "psrlq $3, %%mm2 \n\t"\
  635. \
  636. "movq %%mm2, %%mm1 \n\t"\
  637. "movq %%mm4, %%mm3 \n\t"\
  638. \
  639. "punpcklbw %%mm7, %%mm3 \n\t"\
  640. "punpcklbw %%mm5, %%mm2 \n\t"\
  641. "punpckhbw %%mm7, %%mm4 \n\t"\
  642. "punpckhbw %%mm5, %%mm1 \n\t"\
  643. \
  644. "psllq $3, %%mm3 \n\t"\
  645. "psllq $3, %%mm4 \n\t"\
  646. \
  647. "por %%mm3, %%mm2 \n\t"\
  648. "por %%mm4, %%mm1 \n\t"\
  649. \
  650. MOVNTQ(%%mm2, (dst, index, 2))\
  651. MOVNTQ(%%mm1, 8(dst, index, 2))\
  652. \
  653. "add $8, "#index" \n\t"\
  654. "cmp "#dstw", "#index" \n\t"\
  655. " jb 1b \n\t"
  656. #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
  657. #define REAL_WRITERGB15(dst, dstw, index) \
  658. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  659. "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
  660. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  661. "psrlq $3, %%mm2 \n\t"\
  662. "psrlq $1, %%mm5 \n\t"\
  663. \
  664. "movq %%mm2, %%mm1 \n\t"\
  665. "movq %%mm4, %%mm3 \n\t"\
  666. \
  667. "punpcklbw %%mm7, %%mm3 \n\t"\
  668. "punpcklbw %%mm5, %%mm2 \n\t"\
  669. "punpckhbw %%mm7, %%mm4 \n\t"\
  670. "punpckhbw %%mm5, %%mm1 \n\t"\
  671. \
  672. "psllq $2, %%mm3 \n\t"\
  673. "psllq $2, %%mm4 \n\t"\
  674. \
  675. "por %%mm3, %%mm2 \n\t"\
  676. "por %%mm4, %%mm1 \n\t"\
  677. \
  678. MOVNTQ(%%mm2, (dst, index, 2))\
  679. MOVNTQ(%%mm1, 8(dst, index, 2))\
  680. \
  681. "add $8, "#index" \n\t"\
  682. "cmp "#dstw", "#index" \n\t"\
  683. " jb 1b \n\t"
  684. #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
  685. #define WRITEBGR24OLD(dst, dstw, index) \
  686. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  687. "movq %%mm2, %%mm1 \n\t" /* B */\
  688. "movq %%mm5, %%mm6 \n\t" /* R */\
  689. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  690. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  691. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  692. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  693. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  694. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  695. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  696. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  697. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  698. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  699. \
  700. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  701. "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
  702. "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
  703. "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
  704. "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
  705. "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
  706. "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
  707. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  708. \
  709. "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  710. "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
  711. "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
  712. "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
  713. "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
  714. "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
  715. "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
  716. "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
  717. "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
  718. "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
  719. "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
  720. "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
  721. "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
  722. \
  723. "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
  724. "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
  725. "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
  726. "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
  727. "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
  728. "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
  729. "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
  730. "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
  731. \
  732. MOVNTQ(%%mm0, (dst))\
  733. MOVNTQ(%%mm2, 8(dst))\
  734. MOVNTQ(%%mm3, 16(dst))\
  735. "add $24, "#dst" \n\t"\
  736. \
  737. "add $8, "#index" \n\t"\
  738. "cmp "#dstw", "#index" \n\t"\
  739. " jb 1b \n\t"
  740. #define WRITEBGR24MMX(dst, dstw, index) \
  741. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  742. "movq %%mm2, %%mm1 \n\t" /* B */\
  743. "movq %%mm5, %%mm6 \n\t" /* R */\
  744. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  745. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  746. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  747. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  748. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  749. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  750. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  751. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  752. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  753. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  754. \
  755. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  756. "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
  757. "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
  758. "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
  759. \
  760. "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
  761. "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
  762. "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
  763. "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
  764. \
  765. "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
  766. "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
  767. "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
  768. "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
  769. \
  770. "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
  771. "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
  772. "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
  773. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  774. MOVNTQ(%%mm0, (dst))\
  775. \
  776. "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
  777. "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
  778. "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
  779. "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
  780. MOVNTQ(%%mm6, 8(dst))\
  781. \
  782. "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
  783. "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
  784. "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
  785. MOVNTQ(%%mm5, 16(dst))\
  786. \
  787. "add $24, "#dst" \n\t"\
  788. \
  789. "add $8, "#index" \n\t"\
  790. "cmp "#dstw", "#index" \n\t"\
  791. " jb 1b \n\t"
  792. #define WRITEBGR24MMX2(dst, dstw, index) \
  793. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  794. "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
  795. "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
  796. "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
  797. "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
  798. "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
  799. \
  800. "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
  801. "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
  802. "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
  803. \
  804. "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
  805. "por %%mm1, %%mm6 \n\t"\
  806. "por %%mm3, %%mm6 \n\t"\
  807. MOVNTQ(%%mm6, (dst))\
  808. \
  809. "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
  810. "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
  811. "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
  812. "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
  813. \
  814. "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
  815. "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
  816. "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
  817. \
  818. "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
  819. "por %%mm3, %%mm6 \n\t"\
  820. MOVNTQ(%%mm6, 8(dst))\
  821. \
  822. "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
  823. "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
  824. "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
  825. \
  826. "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
  827. "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
  828. "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
  829. \
  830. "por %%mm1, %%mm3 \n\t"\
  831. "por %%mm3, %%mm6 \n\t"\
  832. MOVNTQ(%%mm6, 16(dst))\
  833. \
  834. "add $24, "#dst" \n\t"\
  835. \
  836. "add $8, "#index" \n\t"\
  837. "cmp "#dstw", "#index" \n\t"\
  838. " jb 1b \n\t"
  839. #if HAVE_MMX2
  840. #undef WRITEBGR24
  841. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
  842. #else
  843. #undef WRITEBGR24
  844. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
  845. #endif
  846. #define REAL_WRITEYUY2(dst, dstw, index) \
  847. "packuswb %%mm3, %%mm3 \n\t"\
  848. "packuswb %%mm4, %%mm4 \n\t"\
  849. "packuswb %%mm7, %%mm1 \n\t"\
  850. "punpcklbw %%mm4, %%mm3 \n\t"\
  851. "movq %%mm1, %%mm7 \n\t"\
  852. "punpcklbw %%mm3, %%mm1 \n\t"\
  853. "punpckhbw %%mm3, %%mm7 \n\t"\
  854. \
  855. MOVNTQ(%%mm1, (dst, index, 2))\
  856. MOVNTQ(%%mm7, 8(dst, index, 2))\
  857. \
  858. "add $8, "#index" \n\t"\
  859. "cmp "#dstw", "#index" \n\t"\
  860. " jb 1b \n\t"
  861. #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
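/**
 * vertical scale YV12 to YV12 (planar output): apply the multi-tap vertical
 * filter to the luma, chroma and (if present) alpha planes, using the MMX
 * macros above unless bit-exact output is requested, otherwise AltiVec or C.
 */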
  862. static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
  863. const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
  864. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
  865. {
  866. #if HAVE_MMX
  867. if(!(c->flags & SWS_BITEXACT)){
  868. if (c->flags & SWS_ACCURATE_RND){
  869. if (uDest){
  870. YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
  871. YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
  872. }
  873. if (CONFIG_SWSCALE_ALPHA && aDest){
  874. YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
  875. }
  876. YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
  877. }else{
  878. if (uDest){
  879. YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
  880. YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
  881. }
  882. if (CONFIG_SWSCALE_ALPHA && aDest){
  883. YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
  884. }
  885. YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
  886. }
  887. return;
  888. }
  889. #endif
  890. #if HAVE_ALTIVEC
  891. yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
  892. chrFilter, chrSrc, chrFilterSize,
  893. dest, uDest, vDest, dstW, chrDstW);
  894. #else //HAVE_ALTIVEC
  895. yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
  896. chrFilter, chrSrc, chrFilterSize,
  897. alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
  898. #endif //!HAVE_ALTIVEC
  899. }
  900. static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
  901. const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
  902. uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
  903. {
  904. yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
  905. chrFilter, chrSrc, chrFilterSize,
  906. dest, uDest, dstW, chrDstW, dstFormat);
  907. }
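/**
 * unscaled vertical path: exactly one input line per output line, so the
 * 16-bit intermediates only need to be converted to bytes (>>7, with
 * rounding and clipping in the accurate and C variants).
 */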
  908. static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
  909. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
  910. {
  911. int i;
  912. #if HAVE_MMX
  913. if(!(c->flags & SWS_BITEXACT)){
  914. long p= 4;
  915. const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
  916. uint8_t *dst[4]= {aDest, dest, uDest, vDest};
  917. x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
  918. if (c->flags & SWS_ACCURATE_RND){
  919. while(p--){
  920. if (dst[p]){
  921. __asm__ volatile(
  922. YSCALEYUV2YV121_ACCURATE
  923. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  924. "g" (-counter[p])
  925. : "%"REG_a
  926. );
  927. }
  928. }
  929. }else{
  930. while(p--){
  931. if (dst[p]){
  932. __asm__ volatile(
  933. YSCALEYUV2YV121
  934. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  935. "g" (-counter[p])
  936. : "%"REG_a
  937. );
  938. }
  939. }
  940. }
  941. return;
  942. }
  943. #endif
  944. for (i=0; i<dstW; i++)
  945. {
  946. int val= (lumSrc[i]+64)>>7;
  947. if (val&256){
  948. if (val<0) val=0;
  949. else val=255;
  950. }
  951. dest[i]= val;
  952. }
  953. if (uDest)
  954. for (i=0; i<chrDstW; i++)
  955. {
  956. int u=(chrSrc[i ]+64)>>7;
  957. int v=(chrSrc[i + VOFW]+64)>>7;
  958. if ((u|v)&256){
  959. if (u<0) u=0;
  960. else if (u>255) u=255;
  961. if (v<0) v=0;
  962. else if (v>255) v=255;
  963. }
  964. uDest[i]= u;
  965. vDest[i]= v;
  966. }
  967. if (CONFIG_SWSCALE_ALPHA && aDest)
  968. for (i=0; i<dstW; i++){
  969. int val= (alpSrc[i]+64)>>7;
  970. aDest[i]= av_clip_uint8(val);
  971. }
  972. }
  973. /**
  974. * vertical scale YV12 to RGB
  975. */
  976. static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
  977. const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
  978. const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
  979. {
  980. #if HAVE_MMX
  981. x86_reg dummy=0;
  982. if(!(c->flags & SWS_BITEXACT)){
  983. if (c->flags & SWS_ACCURATE_RND){
  984. switch(c->dstFormat){
  985. case PIX_FMT_RGB32:
  986. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
  987. YSCALEYUV2PACKEDX_ACCURATE
  988. YSCALEYUV2RGBX
  989. "movq %%mm2, "U_TEMP"(%0) \n\t"
  990. "movq %%mm4, "V_TEMP"(%0) \n\t"
  991. "movq %%mm5, "Y_TEMP"(%0) \n\t"
  992. YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
  993. "movq "Y_TEMP"(%0), %%mm5 \n\t"
  994. "psraw $3, %%mm1 \n\t"
  995. "psraw $3, %%mm7 \n\t"
  996. "packuswb %%mm7, %%mm1 \n\t"
  997. WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
  998. YSCALEYUV2PACKEDX_END
  999. }else{
  1000. YSCALEYUV2PACKEDX_ACCURATE
  1001. YSCALEYUV2RGBX
  1002. "pcmpeqd %%mm7, %%mm7 \n\t"
  1003. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1004. YSCALEYUV2PACKEDX_END
  1005. }
  1006. return;
  1007. case PIX_FMT_BGR24:
  1008. YSCALEYUV2PACKEDX_ACCURATE
  1009. YSCALEYUV2RGBX
  1010. "pxor %%mm7, %%mm7 \n\t"
  1011. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
  1012. "add %4, %%"REG_c" \n\t"
  1013. WRITEBGR24(%%REGc, %5, %%REGa)
  1014. :: "r" (&c->redDither),
  1015. "m" (dummy), "m" (dummy), "m" (dummy),
  1016. "r" (dest), "m" (dstW)
  1017. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  1018. );
  1019. return;
  1020. case PIX_FMT_RGB555:
  1021. YSCALEYUV2PACKEDX_ACCURATE
  1022. YSCALEYUV2RGBX
  1023. "pxor %%mm7, %%mm7 \n\t"
  1024. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1025. #ifdef DITHER1XBPP
  1026. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  1027. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  1028. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  1029. #endif
  1030. WRITERGB15(%4, %5, %%REGa)
  1031. YSCALEYUV2PACKEDX_END
  1032. return;
  1033. case PIX_FMT_RGB565:
  1034. YSCALEYUV2PACKEDX_ACCURATE
  1035. YSCALEYUV2RGBX
  1036. "pxor %%mm7, %%mm7 \n\t"
  1037. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1038. #ifdef DITHER1XBPP
  1039. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  1040. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  1041. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  1042. #endif
  1043. WRITERGB16(%4, %5, %%REGa)
  1044. YSCALEYUV2PACKEDX_END
  1045. return;
  1046. case PIX_FMT_YUYV422:
  1047. YSCALEYUV2PACKEDX_ACCURATE
  1048. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1049. "psraw $3, %%mm3 \n\t"
  1050. "psraw $3, %%mm4 \n\t"
  1051. "psraw $3, %%mm1 \n\t"
  1052. "psraw $3, %%mm7 \n\t"
  1053. WRITEYUY2(%4, %5, %%REGa)
  1054. YSCALEYUV2PACKEDX_END
  1055. return;
  1056. }
  1057. }else{
  1058. switch(c->dstFormat)
  1059. {
  1060. case PIX_FMT_RGB32:
  1061. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
  1062. YSCALEYUV2PACKEDX
  1063. YSCALEYUV2RGBX
  1064. YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
  1065. "psraw $3, %%mm1 \n\t"
  1066. "psraw $3, %%mm7 \n\t"
  1067. "packuswb %%mm7, %%mm1 \n\t"
  1068. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1069. YSCALEYUV2PACKEDX_END
  1070. }else{
  1071. YSCALEYUV2PACKEDX
  1072. YSCALEYUV2RGBX
  1073. "pcmpeqd %%mm7, %%mm7 \n\t"
  1074. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1075. YSCALEYUV2PACKEDX_END
  1076. }
  1077. return;
  1078. case PIX_FMT_BGR24:
  1079. YSCALEYUV2PACKEDX
  1080. YSCALEYUV2RGBX
  1081. "pxor %%mm7, %%mm7 \n\t"
  1082. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
  1083. "add %4, %%"REG_c" \n\t"
  1084. WRITEBGR24(%%REGc, %5, %%REGa)
  1085. :: "r" (&c->redDither),
  1086. "m" (dummy), "m" (dummy), "m" (dummy),
  1087. "r" (dest), "m" (dstW)
  1088. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  1089. );
  1090. return;
  1091. case PIX_FMT_RGB555:
  1092. YSCALEYUV2PACKEDX
  1093. YSCALEYUV2RGBX
  1094. "pxor %%mm7, %%mm7 \n\t"
  1095. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1096. #ifdef DITHER1XBPP
  1097. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  1098. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  1099. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  1100. #endif
  1101. WRITERGB15(%4, %5, %%REGa)
  1102. YSCALEYUV2PACKEDX_END
  1103. return;
  1104. case PIX_FMT_RGB565:
  1105. YSCALEYUV2PACKEDX
  1106. YSCALEYUV2RGBX
  1107. "pxor %%mm7, %%mm7 \n\t"
  1108. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1109. #ifdef DITHER1XBPP
  1110. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  1111. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  1112. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  1113. #endif
  1114. WRITERGB16(%4, %5, %%REGa)
  1115. YSCALEYUV2PACKEDX_END
  1116. return;
  1117. case PIX_FMT_YUYV422:
  1118. YSCALEYUV2PACKEDX
  1119. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1120. "psraw $3, %%mm3 \n\t"
  1121. "psraw $3, %%mm4 \n\t"
  1122. "psraw $3, %%mm1 \n\t"
  1123. "psraw $3, %%mm7 \n\t"
  1124. WRITEYUY2(%4, %5, %%REGa)
  1125. YSCALEYUV2PACKEDX_END
  1126. return;
  1127. }
  1128. }
  1129. }
  1130. #endif /* HAVE_MMX */
  1131. #if HAVE_ALTIVEC
  1132. /* The following list of supported dstFormat values should
  1133. match what's found in the body of ff_yuv2packedX_altivec() */
  1134. if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
  1135. (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
  1136. c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
  1137. c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
  1138. ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
  1139. chrFilter, chrSrc, chrFilterSize,
  1140. dest, dstW, dstY);
  1141. else
  1142. #endif
  1143. yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
  1144. chrFilter, chrSrc, chrFilterSize,
  1145. alpSrc, dest, dstW, dstY);
  1146. }
  1147. /**
  1148. * vertical bilinear scale YV12 to RGB
  1149. */
  1150. static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
  1151. const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
  1152. {
  1153. int yalpha1=4095- yalpha;
  1154. int uvalpha1=4095-uvalpha;
  1155. int i;
  1156. #if HAVE_MMX
  1157. if(!(c->flags & SWS_BITEXACT)){
  1158. switch(c->dstFormat)
  1159. {
  1160. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  1161. case PIX_FMT_RGB32:
  1162. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
  1163. #if ARCH_X86_64
  1164. __asm__ volatile(
  1165. YSCALEYUV2RGB(%%REGBP, %5)
  1166. YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
  1167. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1168. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1169. "packuswb %%mm7, %%mm1 \n\t"
  1170. WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1171. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
  1172. "a" (&c->redDither)
  1173. ,"r" (abuf0), "r" (abuf1)
  1174. : "%"REG_BP
  1175. );
  1176. #else
  1177. *(uint16_t **)(&c->u_temp)=abuf0;
  1178. *(uint16_t **)(&c->v_temp)=abuf1;
  1179. __asm__ volatile(
  1180. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1181. "mov %4, %%"REG_b" \n\t"
  1182. "push %%"REG_BP" \n\t"
  1183. YSCALEYUV2RGB(%%REGBP, %5)
  1184. "push %0 \n\t"
  1185. "push %1 \n\t"
  1186. "mov "U_TEMP"(%5), %0 \n\t"
  1187. "mov "V_TEMP"(%5), %1 \n\t"
  1188. YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
  1189. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1190. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1191. "packuswb %%mm7, %%mm1 \n\t"
  1192. "pop %1 \n\t"
  1193. "pop %0 \n\t"
  1194. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1195. "pop %%"REG_BP" \n\t"
  1196. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1197. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1198. "a" (&c->redDither)
  1199. );
  1200. #endif
  1201. }else{
  1202. __asm__ volatile(
  1203. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1204. "mov %4, %%"REG_b" \n\t"
  1205. "push %%"REG_BP" \n\t"
  1206. YSCALEYUV2RGB(%%REGBP, %5)
  1207. "pcmpeqd %%mm7, %%mm7 \n\t"
  1208. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1209. "pop %%"REG_BP" \n\t"
  1210. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1211. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1212. "a" (&c->redDither)
  1213. );
  1214. }
  1215. return;
  1216. case PIX_FMT_BGR24:
  1217. __asm__ volatile(
  1218. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1219. "mov %4, %%"REG_b" \n\t"
  1220. "push %%"REG_BP" \n\t"
  1221. YSCALEYUV2RGB(%%REGBP, %5)
  1222. "pxor %%mm7, %%mm7 \n\t"
  1223. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1224. "pop %%"REG_BP" \n\t"
  1225. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1226. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1227. "a" (&c->redDither)
  1228. );
  1229. return;
  1230. case PIX_FMT_RGB555:
  1231. __asm__ volatile(
  1232. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1233. "mov %4, %%"REG_b" \n\t"
  1234. "push %%"REG_BP" \n\t"
  1235. YSCALEYUV2RGB(%%REGBP, %5)
  1236. "pxor %%mm7, %%mm7 \n\t"
  1237. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1238. #ifdef DITHER1XBPP
  1239. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1240. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1241. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1242. #endif
  1243. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1244. "pop %%"REG_BP" \n\t"
  1245. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1246. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1247. "a" (&c->redDither)
  1248. );
  1249. return;
  1250. case PIX_FMT_RGB565:
  1251. __asm__ volatile(
  1252. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1253. "mov %4, %%"REG_b" \n\t"
  1254. "push %%"REG_BP" \n\t"
  1255. YSCALEYUV2RGB(%%REGBP, %5)
  1256. "pxor %%mm7, %%mm7 \n\t"
  1257. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1258. #ifdef DITHER1XBPP
  1259. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1260. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1261. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1262. #endif
  1263. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1264. "pop %%"REG_BP" \n\t"
  1265. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1266. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1267. "a" (&c->redDither)
  1268. );
  1269. return;
  1270. case PIX_FMT_YUYV422:
  1271. __asm__ volatile(
  1272. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1273. "mov %4, %%"REG_b" \n\t"
  1274. "push %%"REG_BP" \n\t"
  1275. YSCALEYUV2PACKED(%%REGBP, %5)
  1276. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1277. "pop %%"REG_BP" \n\t"
  1278. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1279. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1280. "a" (&c->redDither)
  1281. );
  1282. return;
  1283. default: break;
  1284. }
  1285. }
  1286. #endif //HAVE_MMX
  1287. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
  1288. }
  1289. /**
  1290. * YV12 to RGB without scaling or interpolating
  1291. */
  1292. static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
  1293. const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
  1294. {
  1295. const int yalpha1=0;
  1296. int i;
  1297. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1298. const int yalpha= 4096; //FIXME ...
  1299. if (flags&SWS_FULL_CHR_H_INT)
  1300. {
  1301. c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
  1302. return;
  1303. }
  1304. #if HAVE_MMX
  1305. if(!(flags & SWS_BITEXACT)){
  1306. if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1307. {
  1308. switch(dstFormat)
  1309. {
  1310. case PIX_FMT_RGB32:
  1311. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
  1312. __asm__ volatile(
  1313. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1314. "mov %4, %%"REG_b" \n\t"
  1315. "push %%"REG_BP" \n\t"
  1316. YSCALEYUV2RGB1(%%REGBP, %5)
  1317. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1318. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1319. "pop %%"REG_BP" \n\t"
  1320. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1321. :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1322. "a" (&c->redDither)
  1323. );
  1324. }else{
  1325. __asm__ volatile(
  1326. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1327. "mov %4, %%"REG_b" \n\t"
  1328. "push %%"REG_BP" \n\t"
  1329. YSCALEYUV2RGB1(%%REGBP, %5)
  1330. "pcmpeqd %%mm7, %%mm7 \n\t"
  1331. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1332. "pop %%"REG_BP" \n\t"
  1333. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1334. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1335. "a" (&c->redDither)
  1336. );
  1337. }
  1338. return;
  1339. case PIX_FMT_BGR24:
  1340. __asm__ volatile(
  1341. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1342. "mov %4, %%"REG_b" \n\t"
  1343. "push %%"REG_BP" \n\t"
  1344. YSCALEYUV2RGB1(%%REGBP, %5)
  1345. "pxor %%mm7, %%mm7 \n\t"
  1346. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1347. "pop %%"REG_BP" \n\t"
  1348. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1349. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1350. "a" (&c->redDither)
  1351. );
  1352. return;
  1353. case PIX_FMT_RGB555:
  1354. __asm__ volatile(
  1355. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1356. "mov %4, %%"REG_b" \n\t"
  1357. "push %%"REG_BP" \n\t"
  1358. YSCALEYUV2RGB1(%%REGBP, %5)
  1359. "pxor %%mm7, %%mm7 \n\t"
  1360. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1361. #ifdef DITHER1XBPP
  1362. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1363. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1364. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1365. #endif
  1366. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1367. "pop %%"REG_BP" \n\t"
  1368. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1369. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1370. "a" (&c->redDither)
  1371. );
  1372. return;
  1373. case PIX_FMT_RGB565:
  1374. __asm__ volatile(
  1375. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1376. "mov %4, %%"REG_b" \n\t"
  1377. "push %%"REG_BP" \n\t"
  1378. YSCALEYUV2RGB1(%%REGBP, %5)
  1379. "pxor %%mm7, %%mm7 \n\t"
  1380. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1381. #ifdef DITHER1XBPP
  1382. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1383. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1384. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1385. #endif
  1386. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1387. "pop %%"REG_BP" \n\t"
  1388. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1389. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1390. "a" (&c->redDither)
  1391. );
  1392. return;
  1393. case PIX_FMT_YUYV422:
  1394. __asm__ volatile(
  1395. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1396. "mov %4, %%"REG_b" \n\t"
  1397. "push %%"REG_BP" \n\t"
  1398. YSCALEYUV2PACKED1(%%REGBP, %5)
  1399. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1400. "pop %%"REG_BP" \n\t"
  1401. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1402. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1403. "a" (&c->redDither)
  1404. );
  1405. return;
  1406. }
  1407. }
  1408. else
  1409. {
  1410. switch(dstFormat)
  1411. {
  1412. case PIX_FMT_RGB32:
  1413. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
  1414. __asm__ volatile(
  1415. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1416. "mov %4, %%"REG_b" \n\t"
  1417. "push %%"REG_BP" \n\t"
  1418. YSCALEYUV2RGB1b(%%REGBP, %5)
  1419. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1420. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1421. "pop %%"REG_BP" \n\t"
  1422. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1423. :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1424. "a" (&c->redDither)
  1425. );
  1426. }else{
  1427. __asm__ volatile(
  1428. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1429. "mov %4, %%"REG_b" \n\t"
  1430. "push %%"REG_BP" \n\t"
  1431. YSCALEYUV2RGB1b(%%REGBP, %5)
  1432. "pcmpeqd %%mm7, %%mm7 \n\t"
  1433. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1434. "pop %%"REG_BP" \n\t"
  1435. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1436. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1437. "a" (&c->redDither)
  1438. );
  1439. }
  1440. return;
  1441. case PIX_FMT_BGR24:
  1442. __asm__ volatile(
  1443. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1444. "mov %4, %%"REG_b" \n\t"
  1445. "push %%"REG_BP" \n\t"
  1446. YSCALEYUV2RGB1b(%%REGBP, %5)
  1447. "pxor %%mm7, %%mm7 \n\t"
  1448. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1449. "pop %%"REG_BP" \n\t"
  1450. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1451. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1452. "a" (&c->redDither)
  1453. );
  1454. return;
  1455. case PIX_FMT_RGB555:
  1456. __asm__ volatile(
  1457. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1458. "mov %4, %%"REG_b" \n\t"
  1459. "push %%"REG_BP" \n\t"
  1460. YSCALEYUV2RGB1b(%%REGBP, %5)
  1461. "pxor %%mm7, %%mm7 \n\t"
  1462. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1463. #ifdef DITHER1XBPP
  1464. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1465. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1466. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1467. #endif
  1468. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1469. "pop %%"REG_BP" \n\t"
  1470. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1471. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1472. "a" (&c->redDither)
  1473. );
  1474. return;
  1475. case PIX_FMT_RGB565:
  1476. __asm__ volatile(
  1477. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1478. "mov %4, %%"REG_b" \n\t"
  1479. "push %%"REG_BP" \n\t"
  1480. YSCALEYUV2RGB1b(%%REGBP, %5)
  1481. "pxor %%mm7, %%mm7 \n\t"
  1482. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1483. #ifdef DITHER1XBPP
  1484. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1485. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1486. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1487. #endif
  1488. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1489. "pop %%"REG_BP" \n\t"
  1490. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1491. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1492. "a" (&c->redDither)
  1493. );
  1494. return;
  1495. case PIX_FMT_YUYV422:
  1496. __asm__ volatile(
  1497. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1498. "mov %4, %%"REG_b" \n\t"
  1499. "push %%"REG_BP" \n\t"
  1500. YSCALEYUV2PACKED1b(%%REGBP, %5)
  1501. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1502. "pop %%"REG_BP" \n\t"
  1503. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1504. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1505. "a" (&c->redDither)
  1506. );
  1507. return;
  1508. }
  1509. }
  1510. }
  1511. #endif /* HAVE_MMX */
  1512. if (uvalpha < 2048)
  1513. {
  1514. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
  1515. }else{
  1516. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
  1517. }
  1518. }
  1519. //FIXME yuy2* can read up to 7 samples too much
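/* Extracts the luma plane from packed YUYV: the MMX path masks the even bytes
 * (the Y samples) with bm01010101 and packs two quadwords per pass, writing
 * eight luma samples per iteration; the C fallback does one sample at a time. */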
  1520. static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1521. {
  1522. #if HAVE_MMX
  1523. __asm__ volatile(
  1524. "movq "MANGLE(bm01010101)", %%mm2 \n\t"
  1525. "mov %0, %%"REG_a" \n\t"
  1526. "1: \n\t"
  1527. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1528. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1529. "pand %%mm2, %%mm0 \n\t"
  1530. "pand %%mm2, %%mm1 \n\t"
  1531. "packuswb %%mm1, %%mm0 \n\t"
  1532. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1533. "add $8, %%"REG_a" \n\t"
  1534. " js 1b \n\t"
  1535. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1536. : "%"REG_a
  1537. );
  1538. #else
  1539. int i;
  1540. for (i=0; i<width; i++)
  1541. dst[i]= src[2*i];
  1542. #endif
  1543. }
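/* Extracts the chroma planes from packed YUYV: psrlw $8 keeps the high byte of
 * each 16-bit word (the U/V samples), then the bm01010101 mask separates U from
 * V, writing four samples to each destination plane per iteration. */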
  1544. static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1545. {
  1546. #if HAVE_MMX
  1547. __asm__ volatile(
  1548. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1549. "mov %0, %%"REG_a" \n\t"
  1550. "1: \n\t"
  1551. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1552. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1553. "psrlw $8, %%mm0 \n\t"
  1554. "psrlw $8, %%mm1 \n\t"
  1555. "packuswb %%mm1, %%mm0 \n\t"
  1556. "movq %%mm0, %%mm1 \n\t"
  1557. "psrlw $8, %%mm0 \n\t"
  1558. "pand %%mm4, %%mm1 \n\t"
  1559. "packuswb %%mm0, %%mm0 \n\t"
  1560. "packuswb %%mm1, %%mm1 \n\t"
  1561. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1562. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1563. "add $4, %%"REG_a" \n\t"
  1564. " js 1b \n\t"
  1565. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1566. : "%"REG_a
  1567. );
  1568. #else
  1569. int i;
  1570. for (i=0; i<width; i++)
  1571. {
  1572. dstU[i]= src1[4*i + 1];
  1573. dstV[i]= src1[4*i + 3];
  1574. }
  1575. #endif
  1576. assert(src1 == src2);
  1577. }
  1578. /* This is almost identical to the previous function, and exists only because
  1579. * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
  1580. static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1581. {
  1582. #if HAVE_MMX
  1583. __asm__ volatile(
  1584. "mov %0, %%"REG_a" \n\t"
  1585. "1: \n\t"
  1586. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1587. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1588. "psrlw $8, %%mm0 \n\t"
  1589. "psrlw $8, %%mm1 \n\t"
  1590. "packuswb %%mm1, %%mm0 \n\t"
  1591. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1592. "add $8, %%"REG_a" \n\t"
  1593. " js 1b \n\t"
  1594. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1595. : "%"REG_a
  1596. );
  1597. #else
  1598. int i;
  1599. for (i=0; i<width; i++)
  1600. dst[i]= src[2*i+1];
  1601. #endif
  1602. }
  1603. static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1604. {
  1605. #if HAVE_MMX
  1606. __asm__ volatile(
  1607. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1608. "mov %0, %%"REG_a" \n\t"
  1609. "1: \n\t"
  1610. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1611. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1612. "pand %%mm4, %%mm0 \n\t"
  1613. "pand %%mm4, %%mm1 \n\t"
  1614. "packuswb %%mm1, %%mm0 \n\t"
  1615. "movq %%mm0, %%mm1 \n\t"
  1616. "psrlw $8, %%mm0 \n\t"
  1617. "pand %%mm4, %%mm1 \n\t"
  1618. "packuswb %%mm0, %%mm0 \n\t"
  1619. "packuswb %%mm1, %%mm1 \n\t"
  1620. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1621. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1622. "add $4, %%"REG_a" \n\t"
  1623. " js 1b \n\t"
  1624. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1625. : "%"REG_a
  1626. );
  1627. #else
  1628. int i;
  1629. for (i=0; i<width; i++)
  1630. {
  1631. dstU[i]= src1[4*i + 0];
  1632. dstV[i]= src1[4*i + 2];
  1633. }
  1634. #endif
  1635. assert(src1 == src2);
  1636. }
  1637. #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
  1638. static inline void RENAME(name)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)\
  1639. {\
  1640. int i;\
  1641. for (i=0; i<width; i++)\
  1642. {\
  1643. int b= (((const type*)src)[i]>>shb)&maskb;\
  1644. int g= (((const type*)src)[i]>>shg)&maskg;\
  1645. int r= (((const type*)src)[i]>>shr)&maskr;\
  1646. \
  1647. dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
  1648. }\
  1649. }
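/* In the BGR2Y instances above, the rounding term (33<<((S)-1)) equals
 * (16<<(S)) + (1<<((S)-1)), i.e. the +16 luma offset plus a half-unit rounding
 * bias folded into a single constant. */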
  1650. BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
  1651. BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
  1652. BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
  1653. BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
  1654. BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
  1655. BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
  1656. static inline void RENAME(abgrToA)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused){
  1657. int i;
  1658. for (i=0; i<width; i++){
  1659. dst[i]= src[4*i];
  1660. }
  1661. }
  1662. #define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
  1663. static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, const uint8_t *dummy, long width, uint32_t *unused)\
  1664. {\
  1665. int i;\
  1666. for (i=0; i<width; i++)\
  1667. {\
  1668. int b= (((const type*)src)[i]&maskb)>>shb;\
  1669. int g= (((const type*)src)[i]&maskg)>>shg;\
  1670. int r= (((const type*)src)[i]&maskr)>>shr;\
  1671. \
  1672. dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
  1673. dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
  1674. }\
  1675. }\
  1676. static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, const uint8_t *dummy, long width, uint32_t *unused)\
  1677. {\
  1678. int i;\
  1679. for (i=0; i<width; i++)\
  1680. {\
  1681. int pix0= ((const type*)src)[2*i+0];\
  1682. int pix1= ((const type*)src)[2*i+1];\
  1683. int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
  1684. int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
  1685. int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
  1686. g&= maskg|(2*maskg);\
  1687. \
  1688. g>>=shg;\
  1689. \
  1690. dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
  1691. dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
  1692. }\
  1693. }
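/* Likewise, (257<<((S)-1)) in BGR2UV equals (128<<(S)) + (1<<((S)-1)), the +128
 * chroma offset plus a half-unit rounding bias. The _half variants sum two
 * horizontally adjacent pixels before the dot product, hence the doubled offset
 * (257<<(S)) and the extra shift (>>((S)+1)). */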
  1694. BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
  1695. BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
  1696. BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
  1697. BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
  1698. BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
  1699. BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
  1700. #if HAVE_MMX
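/* MMX 24-bit RGB/BGR to luma: the first asm block loads the coefficient pair
 * matching srcFormat (BGR vs. RGB component order), then the main loop converts
 * four pixels (12 bytes) per iteration with pmaddwd, adds ff_bgr24toYOffset and
 * shifts right by 15 before packing the result. */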
  1701. static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, int srcFormat)
  1702. {
  1703. if(srcFormat == PIX_FMT_BGR24){
  1704. __asm__ volatile(
  1705. "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
  1706. "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
  1707. :
  1708. );
  1709. }else{
  1710. __asm__ volatile(
  1711. "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
  1712. "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
  1713. :
  1714. );
  1715. }
  1716. __asm__ volatile(
  1717. "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
  1718. "mov %2, %%"REG_a" \n\t"
  1719. "pxor %%mm7, %%mm7 \n\t"
  1720. "1: \n\t"
  1721. PREFETCH" 64(%0) \n\t"
  1722. "movd (%0), %%mm0 \n\t"
  1723. "movd 2(%0), %%mm1 \n\t"
  1724. "movd 6(%0), %%mm2 \n\t"
  1725. "movd 8(%0), %%mm3 \n\t"
  1726. "add $12, %0 \n\t"
  1727. "punpcklbw %%mm7, %%mm0 \n\t"
  1728. "punpcklbw %%mm7, %%mm1 \n\t"
  1729. "punpcklbw %%mm7, %%mm2 \n\t"
  1730. "punpcklbw %%mm7, %%mm3 \n\t"
  1731. "pmaddwd %%mm5, %%mm0 \n\t"
  1732. "pmaddwd %%mm6, %%mm1 \n\t"
  1733. "pmaddwd %%mm5, %%mm2 \n\t"
  1734. "pmaddwd %%mm6, %%mm3 \n\t"
  1735. "paddd %%mm1, %%mm0 \n\t"
  1736. "paddd %%mm3, %%mm2 \n\t"
  1737. "paddd %%mm4, %%mm0 \n\t"
  1738. "paddd %%mm4, %%mm2 \n\t"
  1739. "psrad $15, %%mm0 \n\t"
  1740. "psrad $15, %%mm2 \n\t"
  1741. "packssdw %%mm2, %%mm0 \n\t"
  1742. "packuswb %%mm0, %%mm0 \n\t"
  1743. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1744. "add $4, %%"REG_a" \n\t"
  1745. " js 1b \n\t"
  1746. : "+r" (src)
  1747. : "r" (dst+width), "g" ((x86_reg)-width)
  1748. : "%"REG_a
  1749. );
  1750. }
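/* Chroma counterpart of the function above: the same four-pixels-per-iteration
 * structure, with the U and V coefficients taken from
 * ff_bgr24toUV[srcFormat == PIX_FMT_RGB24], passed in as a memory operand. */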
  1751. static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, int srcFormat)
  1752. {
  1753. __asm__ volatile(
  1754. "movq 24+%4, %%mm6 \n\t"
  1755. "mov %3, %%"REG_a" \n\t"
  1756. "pxor %%mm7, %%mm7 \n\t"
  1757. "1: \n\t"
  1758. PREFETCH" 64(%0) \n\t"
  1759. "movd (%0), %%mm0 \n\t"
  1760. "movd 2(%0), %%mm1 \n\t"
  1761. "punpcklbw %%mm7, %%mm0 \n\t"
  1762. "punpcklbw %%mm7, %%mm1 \n\t"
  1763. "movq %%mm0, %%mm2 \n\t"
  1764. "movq %%mm1, %%mm3 \n\t"
  1765. "pmaddwd %4, %%mm0 \n\t"
  1766. "pmaddwd 8+%4, %%mm1 \n\t"
  1767. "pmaddwd 16+%4, %%mm2 \n\t"
  1768. "pmaddwd %%mm6, %%mm3 \n\t"
  1769. "paddd %%mm1, %%mm0 \n\t"
  1770. "paddd %%mm3, %%mm2 \n\t"
  1771. "movd 6(%0), %%mm1 \n\t"
  1772. "movd 8(%0), %%mm3 \n\t"
  1773. "add $12, %0 \n\t"
  1774. "punpcklbw %%mm7, %%mm1 \n\t"
  1775. "punpcklbw %%mm7, %%mm3 \n\t"
  1776. "movq %%mm1, %%mm4 \n\t"
  1777. "movq %%mm3, %%mm5 \n\t"
  1778. "pmaddwd %4, %%mm1 \n\t"
  1779. "pmaddwd 8+%4, %%mm3 \n\t"
  1780. "pmaddwd 16+%4, %%mm4 \n\t"
  1781. "pmaddwd %%mm6, %%mm5 \n\t"
  1782. "paddd %%mm3, %%mm1 \n\t"
  1783. "paddd %%mm5, %%mm4 \n\t"
  1784. "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
  1785. "paddd %%mm3, %%mm0 \n\t"
  1786. "paddd %%mm3, %%mm2 \n\t"
  1787. "paddd %%mm3, %%mm1 \n\t"
  1788. "paddd %%mm3, %%mm4 \n\t"
  1789. "psrad $15, %%mm0 \n\t"
  1790. "psrad $15, %%mm2 \n\t"
  1791. "psrad $15, %%mm1 \n\t"
  1792. "psrad $15, %%mm4 \n\t"
  1793. "packssdw %%mm1, %%mm0 \n\t"
  1794. "packssdw %%mm4, %%mm2 \n\t"
  1795. "packuswb %%mm0, %%mm0 \n\t"
  1796. "packuswb %%mm2, %%mm2 \n\t"
  1797. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1798. "movd %%mm2, (%2, %%"REG_a") \n\t"
  1799. "add $4, %%"REG_a" \n\t"
  1800. " js 1b \n\t"
  1801. : "+r" (src)
  1802. : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
  1803. : "%"REG_a
  1804. );
  1805. }
  1806. #endif
  1807. static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1808. {
  1809. #if HAVE_MMX
  1810. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
  1811. #else
  1812. int i;
  1813. for (i=0; i<width; i++)
  1814. {
  1815. int b= src[i*3+0];
  1816. int g= src[i*3+1];
  1817. int r= src[i*3+2];
  1818. dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
  1819. }
  1820. #endif /* HAVE_MMX */
  1821. }
  1822. static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1823. {
  1824. #if HAVE_MMX
  1825. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
  1826. #else
  1827. int i;
  1828. for (i=0; i<width; i++)
  1829. {
  1830. int b= src1[3*i + 0];
  1831. int g= src1[3*i + 1];
  1832. int r= src1[3*i + 2];
  1833. dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1834. dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1835. }
  1836. #endif /* HAVE_MMX */
  1837. assert(src1 == src2);
  1838. }
  1839. static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1840. {
  1841. int i;
  1842. for (i=0; i<width; i++)
  1843. {
  1844. int b= src1[6*i + 0] + src1[6*i + 3];
  1845. int g= src1[6*i + 1] + src1[6*i + 4];
  1846. int r= src1[6*i + 2] + src1[6*i + 5];
  1847. dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1848. dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1849. }
  1850. assert(src1 == src2);
  1851. }
  1852. static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1853. {
  1854. #if HAVE_MMX
  1855. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
  1856. #else
  1857. int i;
  1858. for (i=0; i<width; i++)
  1859. {
  1860. int r= src[i*3+0];
  1861. int g= src[i*3+1];
  1862. int b= src[i*3+2];
  1863. dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
  1864. }
  1865. #endif
  1866. }
  1867. static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1868. {
  1869. #if HAVE_MMX
  1870. assert(src1==src2);
  1871. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
  1872. #else
  1873. int i;
  1874. assert(src1==src2);
  1875. for (i=0; i<width; i++)
  1876. {
  1877. int r= src1[3*i + 0];
  1878. int g= src1[3*i + 1];
  1879. int b= src1[3*i + 2];
  1880. dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1881. dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1882. }
  1883. #endif
  1884. }
  1885. static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1886. {
  1887. int i;
  1888. assert(src1==src2);
  1889. for (i=0; i<width; i++)
  1890. {
  1891. int r= src1[6*i + 0] + src1[6*i + 3];
  1892. int g= src1[6*i + 1] + src1[6*i + 4];
  1893. int b= src1[6*i + 2] + src1[6*i + 5];
  1894. dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1895. dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1896. }
  1897. }
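/* palToY/palToUV look up palettized input through pal (c->pal_yuv in swScale);
 * each entry packs Y in bits 0-7, U in bits 8-15 and V in bits 16-23. */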
  1898. static inline void RENAME(palToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *pal)
  1899. {
  1900. int i;
  1901. for (i=0; i<width; i++)
  1902. {
  1903. int d= src[i];
  1904. dst[i]= pal[d] & 0xFF;
  1905. }
  1906. }
  1907. static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV,
  1908. const uint8_t *src1, const uint8_t *src2,
  1909. long width, uint32_t *pal)
  1910. {
  1911. int i;
  1912. assert(src1 == src2);
  1913. for (i=0; i<width; i++)
  1914. {
  1915. int p= pal[src1[i]];
  1916. dstU[i]= p>>8;
  1917. dstV[i]= p>>16;
  1918. }
  1919. }
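/* monowhite2Y/monoblack2Y expand 1 bit-per-pixel input MSB-first to 0/255 luma;
 * monowhite inverts the byte first since a 0 bit means white in that format. */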
  1920. static inline void RENAME(monowhite2Y)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1921. {
  1922. int i, j;
  1923. for (i=0; i<width/8; i++){
  1924. int d= ~src[i];
  1925. for(j=0; j<8; j++)
  1926. dst[8*i+j]= ((d>>(7-j))&1)*255;
  1927. }
  1928. }
  1929. static inline void RENAME(monoblack2Y)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1930. {
  1931. int i, j;
  1932. for (i=0; i<width/8; i++){
  1933. int d= src[i];
  1934. for(j=0; j<8; j++)
  1935. dst[8*i+j]= ((d>>(7-j))&1)*255;
  1936. }
  1937. }
  1938. // bilinear / bicubic scaling
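/* hScale applies a generic horizontal FIR filter: for each output sample i it
 * computes the sum over j of src[filterPos[i]+j]*filter[filterSize*i+j], shifts
 * right by 7 and clips to 15 bits (see the C reference at the bottom). The MMX
 * paths special-case filterSize 4 and 8 and use a generic loop otherwise. */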
  1939. static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
  1940. const int16_t *filter, const int16_t *filterPos, long filterSize)
  1941. {
  1942. #if HAVE_MMX
  1943. assert(filterSize % 4 == 0 && filterSize>0);
  1944. if (filterSize==4) // Always true for upscaling, sometimes for downscaling, too.
  1945. {
  1946. x86_reg counter= -2*dstW;
  1947. filter-= counter*2;
  1948. filterPos-= counter/2;
  1949. dst-= counter/2;
  1950. __asm__ volatile(
  1951. #if defined(PIC)
  1952. "push %%"REG_b" \n\t"
  1953. #endif
  1954. "pxor %%mm7, %%mm7 \n\t"
  1955. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  1956. "mov %%"REG_a", %%"REG_BP" \n\t"
  1957. ASMALIGN(4)
  1958. "1: \n\t"
  1959. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  1960. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  1961. "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
  1962. "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
  1963. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  1964. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  1965. "punpcklbw %%mm7, %%mm0 \n\t"
  1966. "punpcklbw %%mm7, %%mm2 \n\t"
  1967. "pmaddwd %%mm1, %%mm0 \n\t"
  1968. "pmaddwd %%mm2, %%mm3 \n\t"
  1969. "movq %%mm0, %%mm4 \n\t"
  1970. "punpckldq %%mm3, %%mm0 \n\t"
  1971. "punpckhdq %%mm3, %%mm4 \n\t"
  1972. "paddd %%mm4, %%mm0 \n\t"
  1973. "psrad $7, %%mm0 \n\t"
  1974. "packssdw %%mm0, %%mm0 \n\t"
  1975. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  1976. "add $4, %%"REG_BP" \n\t"
  1977. " jnc 1b \n\t"
  1978. "pop %%"REG_BP" \n\t"
  1979. #if defined(PIC)
  1980. "pop %%"REG_b" \n\t"
  1981. #endif
  1982. : "+a" (counter)
  1983. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  1984. #if !defined(PIC)
  1985. : "%"REG_b
  1986. #endif
  1987. );
  1988. }
  1989. else if (filterSize==8)
  1990. {
  1991. x86_reg counter= -2*dstW;
  1992. filter-= counter*4;
  1993. filterPos-= counter/2;
  1994. dst-= counter/2;
  1995. __asm__ volatile(
  1996. #if defined(PIC)
  1997. "push %%"REG_b" \n\t"
  1998. #endif
  1999. "pxor %%mm7, %%mm7 \n\t"
  2000. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  2001. "mov %%"REG_a", %%"REG_BP" \n\t"
  2002. ASMALIGN(4)
  2003. "1: \n\t"
  2004. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  2005. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  2006. "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
  2007. "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
  2008. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  2009. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  2010. "punpcklbw %%mm7, %%mm0 \n\t"
  2011. "punpcklbw %%mm7, %%mm2 \n\t"
  2012. "pmaddwd %%mm1, %%mm0 \n\t"
  2013. "pmaddwd %%mm2, %%mm3 \n\t"
  2014. "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
  2015. "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
  2016. "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
  2017. "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
  2018. "punpcklbw %%mm7, %%mm4 \n\t"
  2019. "punpcklbw %%mm7, %%mm2 \n\t"
  2020. "pmaddwd %%mm1, %%mm4 \n\t"
  2021. "pmaddwd %%mm2, %%mm5 \n\t"
  2022. "paddd %%mm4, %%mm0 \n\t"
  2023. "paddd %%mm5, %%mm3 \n\t"
  2024. "movq %%mm0, %%mm4 \n\t"
  2025. "punpckldq %%mm3, %%mm0 \n\t"
  2026. "punpckhdq %%mm3, %%mm4 \n\t"
  2027. "paddd %%mm4, %%mm0 \n\t"
  2028. "psrad $7, %%mm0 \n\t"
  2029. "packssdw %%mm0, %%mm0 \n\t"
  2030. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  2031. "add $4, %%"REG_BP" \n\t"
  2032. " jnc 1b \n\t"
  2033. "pop %%"REG_BP" \n\t"
  2034. #if defined(PIC)
  2035. "pop %%"REG_b" \n\t"
  2036. #endif
  2037. : "+a" (counter)
  2038. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  2039. #if !defined(PIC)
  2040. : "%"REG_b
  2041. #endif
  2042. );
  2043. }
  2044. else
  2045. {
  2046. const uint8_t *offset = src+filterSize;
  2047. x86_reg counter= -2*dstW;
  2048. //filter-= counter*filterSize/2;
  2049. filterPos-= counter/2;
  2050. dst-= counter/2;
  2051. __asm__ volatile(
  2052. "pxor %%mm7, %%mm7 \n\t"
  2053. ASMALIGN(4)
  2054. "1: \n\t"
  2055. "mov %2, %%"REG_c" \n\t"
  2056. "movzwl (%%"REG_c", %0), %%eax \n\t"
  2057. "movzwl 2(%%"REG_c", %0), %%edx \n\t"
  2058. "mov %5, %%"REG_c" \n\t"
  2059. "pxor %%mm4, %%mm4 \n\t"
  2060. "pxor %%mm5, %%mm5 \n\t"
  2061. "2: \n\t"
  2062. "movq (%1), %%mm1 \n\t"
  2063. "movq (%1, %6), %%mm3 \n\t"
  2064. "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
  2065. "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
  2066. "punpcklbw %%mm7, %%mm0 \n\t"
  2067. "punpcklbw %%mm7, %%mm2 \n\t"
  2068. "pmaddwd %%mm1, %%mm0 \n\t"
  2069. "pmaddwd %%mm2, %%mm3 \n\t"
  2070. "paddd %%mm3, %%mm5 \n\t"
  2071. "paddd %%mm0, %%mm4 \n\t"
  2072. "add $8, %1 \n\t"
  2073. "add $4, %%"REG_c" \n\t"
  2074. "cmp %4, %%"REG_c" \n\t"
  2075. " jb 2b \n\t"
  2076. "add %6, %1 \n\t"
  2077. "movq %%mm4, %%mm0 \n\t"
  2078. "punpckldq %%mm5, %%mm4 \n\t"
  2079. "punpckhdq %%mm5, %%mm0 \n\t"
  2080. "paddd %%mm0, %%mm4 \n\t"
  2081. "psrad $7, %%mm4 \n\t"
  2082. "packssdw %%mm4, %%mm4 \n\t"
  2083. "mov %3, %%"REG_a" \n\t"
  2084. "movd %%mm4, (%%"REG_a", %0) \n\t"
  2085. "add $4, %0 \n\t"
  2086. " jnc 1b \n\t"
  2087. : "+r" (counter), "+r" (filter)
  2088. : "m" (filterPos), "m" (dst), "m"(offset),
  2089. "m" (src), "r" ((x86_reg)filterSize*2)
  2090. : "%"REG_a, "%"REG_c, "%"REG_d
  2091. );
  2092. }
  2093. #else
  2094. #if HAVE_ALTIVEC
  2095. hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
  2096. #else
  2097. int i;
  2098. for (i=0; i<dstW; i++)
  2099. {
  2100. int j;
  2101. int srcPos= filterPos[i];
  2102. int val=0;
  2103. //printf("filterPos: %d\n", filterPos[i]);
  2104. for (j=0; j<filterSize; j++)
  2105. {
  2106. //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
  2107. val += ((int)src[srcPos + j])*filter[filterSize*i + j];
  2108. }
  2109. //filter += hFilterSize;
  2110. dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
  2111. //dst[i] = val>>7;
  2112. }
  2113. #endif /* HAVE_ALTIVEC */
  2114. #endif /* HAVE_MMX */
  2115. }
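/* hyscale_fast steps through the source with a 16.16 fixed-point position
 * (xpos += xInc); xalpha is the top 7 bits of the fractional part, so dst
 * receives the linearly interpolated sample in 15-bit (<<7) precision. */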
  2116. static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
  2117. int dstWidth, const uint8_t *src, int srcW,
  2118. int xInc)
  2119. {
  2120. int i;
  2121. unsigned int xpos=0;
  2122. for (i=0;i<dstWidth;i++)
  2123. {
  2124. register unsigned int xx=xpos>>16;
  2125. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  2126. dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
  2127. xpos+=xInc;
  2128. }
  2129. }
  2130. // *** horizontal scale Y line to temp buffer
  2131. static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
  2132. int flags, const int16_t *hLumFilter,
  2133. const int16_t *hLumFilterPos, int hLumFilterSize,
  2134. int srcFormat, uint8_t *formatConvBuffer,
  2135. uint32_t *pal, int isAlpha)
  2136. {
  2137. int32_t *mmx2FilterPos = c->lumMmx2FilterPos;
  2138. int16_t *mmx2Filter = c->lumMmx2Filter;
  2139. int canMMX2BeUsed = c->canMMX2BeUsed;
  2140. void *funnyYCode = c->funnyYCode;
  2141. if (isAlpha) {
  2142. if (srcFormat == PIX_FMT_RGB32 || srcFormat == PIX_FMT_BGR32 )
  2143. src += 3;
  2144. } else {
  2145. if (srcFormat == PIX_FMT_RGB32_1 || srcFormat == PIX_FMT_BGR32_1)
  2146. src += ALT32_CORR;
  2147. }
  2148. if (c->hyscale_internal) {
  2149. c->hyscale_internal(formatConvBuffer, src, srcW, pal);
  2150. src= formatConvBuffer;
  2151. }
  2152. #if HAVE_MMX
  2153. // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
  2154. if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
  2155. #else
  2156. if (!(flags&SWS_FAST_BILINEAR))
  2157. #endif
  2158. {
  2159. c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
  2160. }
  2161. else // fast bilinear upscale / crap downscale
  2162. {
  2163. #if ARCH_X86 && CONFIG_GPL
  2164. #if HAVE_MMX2
  2165. int i;
  2166. #if defined(PIC)
  2167. uint64_t ebxsave __attribute__((aligned(8)));
  2168. #endif
  2169. if (canMMX2BeUsed)
  2170. {
  2171. __asm__ volatile(
  2172. #if defined(PIC)
  2173. "mov %%"REG_b", %5 \n\t"
  2174. #endif
  2175. "pxor %%mm7, %%mm7 \n\t"
  2176. "mov %0, %%"REG_c" \n\t"
  2177. "mov %1, %%"REG_D" \n\t"
  2178. "mov %2, %%"REG_d" \n\t"
  2179. "mov %3, %%"REG_b" \n\t"
  2180. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2181. PREFETCH" (%%"REG_c") \n\t"
  2182. PREFETCH" 32(%%"REG_c") \n\t"
  2183. PREFETCH" 64(%%"REG_c") \n\t"
  2184. #if ARCH_X86_64
  2185. #define FUNNY_Y_CODE \
  2186. "movl (%%"REG_b"), %%esi \n\t"\
  2187. "call *%4 \n\t"\
  2188. "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
  2189. "add %%"REG_S", %%"REG_c" \n\t"\
  2190. "add %%"REG_a", %%"REG_D" \n\t"\
  2191. "xor %%"REG_a", %%"REG_a" \n\t"\
  2192. #else
  2193. #define FUNNY_Y_CODE \
  2194. "movl (%%"REG_b"), %%esi \n\t"\
  2195. "call *%4 \n\t"\
  2196. "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
  2197. "add %%"REG_a", %%"REG_D" \n\t"\
  2198. "xor %%"REG_a", %%"REG_a" \n\t"\
  2199. #endif /* ARCH_X86_64 */
  2200. FUNNY_Y_CODE
  2201. FUNNY_Y_CODE
  2202. FUNNY_Y_CODE
  2203. FUNNY_Y_CODE
  2204. FUNNY_Y_CODE
  2205. FUNNY_Y_CODE
  2206. FUNNY_Y_CODE
  2207. FUNNY_Y_CODE
  2208. #if defined(PIC)
  2209. "mov %5, %%"REG_b" \n\t"
  2210. #endif
  2211. :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
  2212. "m" (funnyYCode)
  2213. #if defined(PIC)
  2214. ,"m" (ebxsave)
  2215. #endif
  2216. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2217. #if !defined(PIC)
  2218. ,"%"REG_b
  2219. #endif
  2220. );
  2221. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
  2222. }
  2223. else
  2224. {
  2225. #endif /* HAVE_MMX2 */
  2226. x86_reg xInc_shr16 = xInc >> 16;
  2227. uint16_t xInc_mask = xInc & 0xffff;
  2228. //NO MMX just normal asm ...
  2229. __asm__ volatile(
  2230. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2231. "xor %%"REG_d", %%"REG_d" \n\t" // xx
  2232. "xorl %%ecx, %%ecx \n\t" // 2*xalpha
  2233. ASMALIGN(4)
  2234. "1: \n\t"
  2235. "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
  2236. "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2237. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  2238. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  2239. "shll $16, %%edi \n\t"
  2240. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  2241. "mov %1, %%"REG_D" \n\t"
  2242. "shrl $9, %%esi \n\t"
  2243. "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
  2244. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  2245. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
  2246. "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
  2247. "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2248. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  2249. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  2250. "shll $16, %%edi \n\t"
  2251. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  2252. "mov %1, %%"REG_D" \n\t"
  2253. "shrl $9, %%esi \n\t"
  2254. "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
  2255. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  2256. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
  2257. "add $2, %%"REG_a" \n\t"
  2258. "cmp %2, %%"REG_a" \n\t"
  2259. " jb 1b \n\t"
  2260. :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
  2261. : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
  2262. );
  2263. #if HAVE_MMX2
  2264. } //if MMX2 can't be used
  2265. #endif
  2266. #else
  2267. c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
  2268. #endif /* ARCH_X86 */
  2269. }
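/* The block below converts the 15-bit luma between full (JPEG) and limited
 * (MPEG) range: 14071/2^14 is roughly 219/255 and the additive constant roughly
 * 16*128*2^14, while the reverse path uses 19077/2^14 (about 255/219) and clamps
 * its input so the result stays within the 15-bit intermediate range. */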
  2270. if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
  2271. int i;
  2272. //FIXME all pal and rgb srcFormats could do this conversion as well
  2273. //FIXME all scalers more complex than bilinear could do half of this transform
  2274. if(c->srcRange){
  2275. for (i=0; i<dstWidth; i++)
  2276. dst[i]= (dst[i]*14071 + 33561947)>>14;
  2277. }else{
  2278. for (i=0; i<dstWidth; i++)
  2279. dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
  2280. }
  2281. }
  2282. }
  2283. static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
  2284. int dstWidth, const uint8_t *src1,
  2285. const uint8_t *src2, int srcW, int xInc)
  2286. {
  2287. int i;
  2288. unsigned int xpos=0;
  2289. for (i=0;i<dstWidth;i++)
  2290. {
  2291. register unsigned int xx=xpos>>16;
  2292. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  2293. dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
  2294. dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
  2295. /* slower
  2296. dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
  2297. dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
  2298. */
  2299. xpos+=xInc;
  2300. }
  2301. }
  2302. inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
  2303. int srcW, int xInc, int flags, const int16_t *hChrFilter,
  2304. const int16_t *hChrFilterPos, int hChrFilterSize,
  2305. int srcFormat, uint8_t *formatConvBuffer,
  2306. uint32_t *pal)
  2307. {
  2308. int32_t *mmx2FilterPos = c->chrMmx2FilterPos;
  2309. int16_t *mmx2Filter = c->chrMmx2Filter;
  2310. int canMMX2BeUsed = c->canMMX2BeUsed;
  2311. void *funnyUVCode = c->funnyUVCode;
  2312. if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
  2313. return;
  2314. if (srcFormat==PIX_FMT_RGB32_1) {
  2315. src1 += ALT32_CORR;
  2316. src2 += ALT32_CORR;
  2317. }
  2318. if (c->hcscale_internal) {
  2319. c->hcscale_internal(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2320. src1= formatConvBuffer;
  2321. src2= formatConvBuffer+VOFW;
  2322. }
  2323. #if HAVE_MMX
  2324. // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
  2325. if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
  2326. #else
  2327. if (!(flags&SWS_FAST_BILINEAR))
  2328. #endif
  2329. {
  2330. c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  2331. c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  2332. }
  2333. else // fast bilinear upscale / crap downscale
  2334. {
  2335. #if ARCH_X86 && CONFIG_GPL
  2336. #if HAVE_MMX2
  2337. int i;
  2338. #if defined(PIC)
  2339. uint64_t ebxsave __attribute__((aligned(8)));
  2340. #endif
  2341. if (canMMX2BeUsed)
  2342. {
  2343. __asm__ volatile(
  2344. #if defined(PIC)
  2345. "mov %%"REG_b", %6 \n\t"
  2346. #endif
  2347. "pxor %%mm7, %%mm7 \n\t"
  2348. "mov %0, %%"REG_c" \n\t"
  2349. "mov %1, %%"REG_D" \n\t"
  2350. "mov %2, %%"REG_d" \n\t"
  2351. "mov %3, %%"REG_b" \n\t"
  2352. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2353. PREFETCH" (%%"REG_c") \n\t"
  2354. PREFETCH" 32(%%"REG_c") \n\t"
  2355. PREFETCH" 64(%%"REG_c") \n\t"
  2356. #if ARCH_X86_64
  2357. #define FUNNY_UV_CODE \
  2358. "movl (%%"REG_b"), %%esi \n\t"\
  2359. "call *%4 \n\t"\
  2360. "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
  2361. "add %%"REG_S", %%"REG_c" \n\t"\
  2362. "add %%"REG_a", %%"REG_D" \n\t"\
  2363. "xor %%"REG_a", %%"REG_a" \n\t"\
  2364. #else
  2365. #define FUNNY_UV_CODE \
  2366. "movl (%%"REG_b"), %%esi \n\t"\
  2367. "call *%4 \n\t"\
  2368. "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
  2369. "add %%"REG_a", %%"REG_D" \n\t"\
  2370. "xor %%"REG_a", %%"REG_a" \n\t"\
  2371. #endif /* ARCH_X86_64 */
  2372. FUNNY_UV_CODE
  2373. FUNNY_UV_CODE
  2374. FUNNY_UV_CODE
  2375. FUNNY_UV_CODE
  2376. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2377. "mov %5, %%"REG_c" \n\t" // src
  2378. "mov %1, %%"REG_D" \n\t" // buf1
  2379. "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
  2380. PREFETCH" (%%"REG_c") \n\t"
  2381. PREFETCH" 32(%%"REG_c") \n\t"
  2382. PREFETCH" 64(%%"REG_c") \n\t"
  2383. FUNNY_UV_CODE
  2384. FUNNY_UV_CODE
  2385. FUNNY_UV_CODE
  2386. FUNNY_UV_CODE
  2387. #if defined(PIC)
  2388. "mov %6, %%"REG_b" \n\t"
  2389. #endif
  2390. :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
  2391. "m" (funnyUVCode), "m" (src2)
  2392. #if defined(PIC)
  2393. ,"m" (ebxsave)
  2394. #endif
  2395. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2396. #if !defined(PIC)
  2397. ,"%"REG_b
  2398. #endif
  2399. );
  2400. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
  2401. {
  2402. //printf("%d %d %d\n", dstWidth, i, srcW);
  2403. dst[i] = src1[srcW-1]*128;
  2404. dst[i+VOFW] = src2[srcW-1]*128;
  2405. }
  2406. }
  2407. else
  2408. {
  2409. #endif /* HAVE_MMX2 */
  2410. x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
  2411. uint16_t xInc_mask = xInc & 0xffff;
  2412. __asm__ volatile(
  2413. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2414. "xor %%"REG_d", %%"REG_d" \n\t" // xx
  2415. "xorl %%ecx, %%ecx \n\t" // 2*xalpha
  2416. ASMALIGN(4)
  2417. "1: \n\t"
  2418. "mov %0, %%"REG_S" \n\t"
  2419. "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
  2420. "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
  2421. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  2422. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  2423. "shll $16, %%edi \n\t"
  2424. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  2425. "mov %1, %%"REG_D" \n\t"
  2426. "shrl $9, %%esi \n\t"
  2427. "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
  2428. "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
  2429. "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2430. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  2431. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  2432. "shll $16, %%edi \n\t"
  2433. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  2434. "mov %1, %%"REG_D" \n\t"
  2435. "shrl $9, %%esi \n\t"
  2436. "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
  2437. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  2438. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
  2439. "add $1, %%"REG_a" \n\t"
  2440. "cmp %2, %%"REG_a" \n\t"
  2441. " jb 1b \n\t"
  2442. /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
  2443. which is needed to support GCC 4.0. */
  2444. #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
  2445. :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
  2446. #else
  2447. :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
  2448. #endif
  2449. "r" (src2)
  2450. : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
  2451. );
  2452. #if HAVE_MMX2
  2453. } //if MMX2 can't be used
  2454. #endif
  2455. #else
  2456. c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
  2457. #endif /* ARCH_X86 */
  2458. }
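/* Chroma counterpart of the luma range conversion in hyscale: 1799/2^11 is
 * roughly 224/255 and 4663/2^12 roughly 255/224, applied around the +128 chroma
 * offset, again with a clamp on the expanding path. */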
  2459. if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
  2460. int i;
  2461. //FIXME all pal and rgb srcFormats could do this conversion as well
  2462. //FIXME all scalers more complex than bilinear could do half of this transform
  2463. if(c->srcRange){
  2464. for (i=0; i<dstWidth; i++){
  2465. dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
  2466. dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
  2467. }
  2468. }else{
  2469. for (i=0; i<dstWidth; i++){
  2470. dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
  2471. dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
  2472. }
  2473. }
  2474. }
  2475. }
  2476. static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  2477. int srcSliceH, uint8_t* dst[], int dstStride[]){
  2478. /* load a few things into local vars to make the code more readable and faster */
  2479. const int srcW= c->srcW;
  2480. const int dstW= c->dstW;
  2481. const int dstH= c->dstH;
  2482. const int chrDstW= c->chrDstW;
  2483. const int chrSrcW= c->chrSrcW;
  2484. const int lumXInc= c->lumXInc;
  2485. const int chrXInc= c->chrXInc;
  2486. const int dstFormat= c->dstFormat;
  2487. const int srcFormat= c->srcFormat;
  2488. const int flags= c->flags;
  2489. int16_t *vLumFilterPos= c->vLumFilterPos;
  2490. int16_t *vChrFilterPos= c->vChrFilterPos;
  2491. int16_t *hLumFilterPos= c->hLumFilterPos;
  2492. int16_t *hChrFilterPos= c->hChrFilterPos;
  2493. int16_t *vLumFilter= c->vLumFilter;
  2494. int16_t *vChrFilter= c->vChrFilter;
  2495. int16_t *hLumFilter= c->hLumFilter;
  2496. int16_t *hChrFilter= c->hChrFilter;
  2497. int32_t *lumMmxFilter= c->lumMmxFilter;
  2498. int32_t *chrMmxFilter= c->chrMmxFilter;
  2499. int32_t *alpMmxFilter= c->alpMmxFilter;
  2500. const int vLumFilterSize= c->vLumFilterSize;
  2501. const int vChrFilterSize= c->vChrFilterSize;
  2502. const int hLumFilterSize= c->hLumFilterSize;
  2503. const int hChrFilterSize= c->hChrFilterSize;
  2504. int16_t **lumPixBuf= c->lumPixBuf;
  2505. int16_t **chrPixBuf= c->chrPixBuf;
  2506. int16_t **alpPixBuf= c->alpPixBuf;
  2507. const int vLumBufSize= c->vLumBufSize;
  2508. const int vChrBufSize= c->vChrBufSize;
  2509. uint8_t *formatConvBuffer= c->formatConvBuffer;
  2510. const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
  2511. const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
  2512. int lastDstY;
  2513. uint32_t *pal=c->pal_yuv;
  2514. /* vars which will change and which we need to store back in the context */
  2515. int dstY= c->dstY;
  2516. int lumBufIndex= c->lumBufIndex;
  2517. int chrBufIndex= c->chrBufIndex;
  2518. int lastInLumBuf= c->lastInLumBuf;
  2519. int lastInChrBuf= c->lastInChrBuf;
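/* Packed source formats carry all their data in plane 0, so alias the remaining
 * plane pointers and strides to it for the generic per-plane code below. */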
  2520. if (isPacked(c->srcFormat)){
  2521. src[0]=
  2522. src[1]=
  2523. src[2]=
  2524. src[3]= src[0];
  2525. srcStride[0]=
  2526. srcStride[1]=
  2527. srcStride[2]=
  2528. srcStride[3]= srcStride[0];
  2529. }
  2530. srcStride[1]<<= c->vChrDrop;
  2531. srcStride[2]<<= c->vChrDrop;
  2532. //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
  2533. // (int)dst[0], (int)dst[1], (int)dst[2]);
  2534. #if 0 //self test FIXME move to a vfilter or something
  2535. {
  2536. static volatile int i=0;
  2537. i++;
  2538. if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
  2539. selfTest(src, srcStride, c->srcW, c->srcH);
  2540. i--;
  2541. }
  2542. #endif
  2543. //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
  2544. //dstStride[0],dstStride[1],dstStride[2]);
  2545. if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0)
  2546. {
  2547. static int warnedAlready=0; //FIXME move this into the context perhaps
  2548. if (flags & SWS_PRINT_INFO && !warnedAlready)
  2549. {
  2550. av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
  2551. " ->cannot do aligned memory accesses anymore\n");
  2552. warnedAlready=1;
  2553. }
  2554. }
  2555. /* Note that the user might start scaling in the middle of the picture, so this
  2556. will not get executed. This is not really intended, but it works
  2557. currently, so people might rely on it. */
  2558. if (srcSliceY ==0){
  2559. lumBufIndex=0;
  2560. chrBufIndex=0;
  2561. dstY=0;
  2562. lastInLumBuf= -1;
  2563. lastInChrBuf= -1;
  2564. }
  2565. lastDstY= dstY;
  2566. for (;dstY < dstH; dstY++){
  2567. unsigned char *dest =dst[0]+dstStride[0]*dstY;
  2568. const int chrDstY= dstY>>c->chrDstVSubSample;
  2569. unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
  2570. unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
  2571. unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
  2572. const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
  2573. const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
  2574. const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
  2575. const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
  2576. //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
  2577. // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
  2578. //handle holes (FAST_BILINEAR & weird filters)
  2579. if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
  2580. if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
  2581. //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
  2582. assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
  2583. assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
  2584. // Do we have enough lines in this slice to output the dstY line?
  2585. if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
  2586. {
  2587. //Do horizontal scaling
  2588. while(lastInLumBuf < lastLumSrcY)
  2589. {
  2590. uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
  2591. uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
  2592. lumBufIndex++;
  2593. //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
  2594. assert(lumBufIndex < 2*vLumBufSize);
  2595. assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
  2596. assert(lastInLumBuf + 1 - srcSliceY >= 0);
  2597. //printf("%d %d\n", lumBufIndex, vLumBufSize);
  2598. RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
  2599. flags, hLumFilter, hLumFilterPos, hLumFilterSize,
  2600. c->srcFormat, formatConvBuffer,
  2601. pal, 0);
  2602. if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
  2603. RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
  2604. flags, hLumFilter, hLumFilterPos, hLumFilterSize,
  2605. c->srcFormat, formatConvBuffer,
  2606. pal, 1);
  2607. lastInLumBuf++;
  2608. }
  2609. while(lastInChrBuf < lastChrSrcY)
  2610. {
  2611. uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
  2612. uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
  2613. chrBufIndex++;
  2614. assert(chrBufIndex < 2*vChrBufSize);
  2615. assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
  2616. assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
  2617. //FIXME replace parameters through context struct (some at least)
  2618. if (!(isGray(srcFormat) || isGray(dstFormat)))
  2619. RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
  2620. flags, hChrFilter, hChrFilterPos, hChrFilterSize,
  2621. c->srcFormat, formatConvBuffer,
  2622. pal);
  2623. lastInChrBuf++;
  2624. }
  2625. //wrap buf index around to stay inside the ring buffer
  2626. if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
  2627. if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
  2628. }
  2629. else // not enough lines left in this slice -> load the rest in the buffer
  2630. {
  2631. /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
  2632. firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
  2633. lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
  2634. vChrBufSize, vLumBufSize);*/
  2635. //Do horizontal scaling
  2636. while(lastInLumBuf+1 < srcSliceY + srcSliceH)
  2637. {
  2638. uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
  2639. uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
  2640. lumBufIndex++;
  2641. assert(lumBufIndex < 2*vLumBufSize);
  2642. assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
  2643. assert(lastInLumBuf + 1 - srcSliceY >= 0);
  2644. RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
  2645. flags, hLumFilter, hLumFilterPos, hLumFilterSize,
  2646. c->srcFormat, formatConvBuffer,
  2647. pal, 0);
  2648. if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
  2649. RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
  2650. flags, hLumFilter, hLumFilterPos, hLumFilterSize,
  2651. c->srcFormat, formatConvBuffer,
  2652. pal, 1);
  2653. lastInLumBuf++;
  2654. }
  2655. while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
  2656. {
  2657. uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
  2658. uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
  2659. chrBufIndex++;
  2660. assert(chrBufIndex < 2*vChrBufSize);
  2661. assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
  2662. assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
  2663. if (!(isGray(srcFormat) || isGray(dstFormat)))
  2664. RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
  2665. flags, hChrFilter, hChrFilterPos, hChrFilterSize,
  2666. c->srcFormat, formatConvBuffer,
  2667. pal);
  2668. lastInChrBuf++;
  2669. }
  2670. //wrap buf index around to stay inside the ring buffer
  2671. if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
  2672. if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
  2673. break; //we can't output a dstY line so let's try with the next slice
  2674. }
  2675. #if HAVE_MMX
  2676. c->blueDither= ff_dither8[dstY&1];
  2677. if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
  2678. c->greenDither= ff_dither8[dstY&1];
  2679. else
  2680. c->greenDither= ff_dither4[dstY&1];
  2681. c->redDither= ff_dither8[(dstY+1)&1];
  2682. #endif
  2683. if (dstY < dstH-2)
  2684. {
  2685. const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  2686. const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
  2687. const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
  2688. #if HAVE_MMX
  2689. int i;
  2690. if (flags & SWS_ACCURATE_RND){
  2691. int s= APCK_SIZE / 8;
  2692. for (i=0; i<vLumFilterSize; i+=2){
  2693. *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
  2694. *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
  2695. lumMmxFilter[s*i+APCK_COEF/4 ]=
  2696. lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
  2697. + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
  2698. if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
  2699. *(void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
  2700. *(void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
  2701. alpMmxFilter[s*i+APCK_COEF/4 ]=
  2702. alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
  2703. }
  2704. }
  2705. for (i=0; i<vChrFilterSize; i+=2){
  2706. *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
  2707. *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
  2708. chrMmxFilter[s*i+APCK_COEF/4 ]=
  2709. chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
  2710. + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
  2711. }
  2712. }else{
  2713. for (i=0; i<vLumFilterSize; i++)
  2714. {
  2715. lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
  2716. lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
  2717. lumMmxFilter[4*i+2]=
  2718. lumMmxFilter[4*i+3]=
  2719. ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
  2720. if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
  2721. alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
  2722. alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
  2723. alpMmxFilter[4*i+2]=
  2724. alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
  2725. }
  2726. }
  2727. for (i=0; i<vChrFilterSize; i++)
  2728. {
  2729. chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
  2730. chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
  2731. chrMmxFilter[4*i+2]=
  2732. chrMmxFilter[4*i+3]=
  2733. ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
  2734. }
  2735. }
  2736. #endif
  2737. if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
  2738. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
  2739. if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
  2740. c->yuv2nv12X(c,
  2741. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2742. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2743. dest, uDest, dstW, chrDstW, dstFormat);
  2744. }
  2745. else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
  2746. {
  2747. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
  2748. if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
  2749. if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
  2750. {
  2751. int16_t *lumBuf = lumPixBuf[0];
  2752. int16_t *chrBuf= chrPixBuf[0];
  2753. int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf[0] : NULL;
  2754. c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
  2755. }
  2756. else //General YV12
  2757. {
  2758. c->yuv2yuvX(c,
  2759. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2760. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2761. alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
  2762. }
  2763. }
  2764. else
  2765. {
  2766. assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
  2767. assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
  2768. if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
  2769. {
  2770. int chrAlpha= vChrFilter[2*dstY+1];
  2771. if(flags & SWS_FULL_CHR_H_INT){
  2772. yuv2rgbXinC_full(c, //FIXME write a packed1_full function
  2773. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2774. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2775. alpSrcPtr, dest, dstW, dstY);
  2776. }else{
  2777. c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
  2778. alpPixBuf ? *alpSrcPtr : NULL,
  2779. dest, dstW, chrAlpha, dstFormat, flags, dstY);
  2780. }
  2781. }
  2782. else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
  2783. {
  2784. int lumAlpha= vLumFilter[2*dstY+1];
  2785. int chrAlpha= vChrFilter[2*dstY+1];
  2786. lumMmxFilter[2]=
  2787. lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
  2788. chrMmxFilter[2]=
  2789. chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
  2790. if(flags & SWS_FULL_CHR_H_INT){
  2791. yuv2rgbXinC_full(c, //FIXME write a packed2_full function
  2792. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2793. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2794. alpSrcPtr, dest, dstW, dstY);
  2795. }else{
  2796. c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
  2797. alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
  2798. dest, dstW, lumAlpha, chrAlpha, dstY);
  2799. }
  2800. }
  2801. else //general RGB
  2802. {
  2803. if(flags & SWS_FULL_CHR_H_INT){
  2804. yuv2rgbXinC_full(c,
  2805. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2806. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2807. alpSrcPtr, dest, dstW, dstY);
  2808. }else{
  2809. c->yuv2packedX(c,
  2810. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2811. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2812. alpSrcPtr, dest, dstW, dstY);
  2813. }
  2814. }
  2815. }
  2816. }
        else // hmm looks like we can't use MMX here without overwriting this array's tail
        {
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                yuv2yuvXinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT){
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                }else{
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }
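
    /* YUVA destination but no alpha source: fill the alpha plane of the rows just written with opaque (255). */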
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if HAVE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
    else                            __asm__ volatile("emms"  :::"memory");
#endif

    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
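
/* Fill in the per-CPU (RENAME()d) scaler entry points and select the
 * per-format input converters used by the horizontal scalers. */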
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;

    c->yuv2nv12X    = RENAME(yuv2nv12X   );
    c->yuv2yuv1     = RENAME(yuv2yuv1    );
    c->yuv2yuvX     = RENAME(yuv2yuvX    );
    c->yuv2packed1  = RENAME(yuv2packed1 );
    c->yuv2packed2  = RENAME(yuv2packed2 );
    c->yuv2packedX  = RENAME(yuv2packedX );
    c->hScale       = RENAME(hScale      );
    c->hyscale_fast = RENAME(hyscale_fast);
    c->hcscale_fast = RENAME(hcscale_fast);
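
    /* chroma (UV) input converters; the *_half variants handle the
       horizontally chroma-subsampled case (c->chrSrcHSubSample) */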
    c->hcscale_internal = NULL;
    switch(srcFormat) {
    case PIX_FMT_YUYV422  : c->hcscale_internal = RENAME(yuy2ToUV); break;
    case PIX_FMT_UYVY422  : c->hcscale_internal = RENAME(uyvyToUV); break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->hcscale_internal = RENAME(palToUV); break;
    }
    if (c->chrSrcHSubSample) {
        switch(srcFormat) {
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->hcscale_internal = RENAME(bgr32ToUV_half); break;
        case PIX_FMT_BGR24  : c->hcscale_internal = RENAME(bgr24ToUV_half); break;
        case PIX_FMT_BGR565 : c->hcscale_internal = RENAME(bgr16ToUV_half); break;
        case PIX_FMT_BGR555 : c->hcscale_internal = RENAME(bgr15ToUV_half); break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->hcscale_internal = RENAME(rgb32ToUV_half); break;
        case PIX_FMT_RGB24  : c->hcscale_internal = RENAME(rgb24ToUV_half); break;
        case PIX_FMT_RGB565 : c->hcscale_internal = RENAME(rgb16ToUV_half); break;
        case PIX_FMT_RGB555 : c->hcscale_internal = RENAME(rgb15ToUV_half); break;
        }
    } else {
        switch(srcFormat) {
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->hcscale_internal = RENAME(bgr32ToUV); break;
        case PIX_FMT_BGR24  : c->hcscale_internal = RENAME(bgr24ToUV); break;
        case PIX_FMT_BGR565 : c->hcscale_internal = RENAME(bgr16ToUV); break;
        case PIX_FMT_BGR555 : c->hcscale_internal = RENAME(bgr15ToUV); break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->hcscale_internal = RENAME(rgb32ToUV); break;
        case PIX_FMT_RGB24  : c->hcscale_internal = RENAME(rgb24ToUV); break;
        case PIX_FMT_RGB565 : c->hcscale_internal = RENAME(rgb16ToUV); break;
        case PIX_FMT_RGB555 : c->hcscale_internal = RENAME(rgb15ToUV); break;
        }
    }
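
    /* luma (Y) input converters for packed/paletted/RGB/mono sources */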
    c->hyscale_internal = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUYV422  :
    case PIX_FMT_GRAY16BE : c->hyscale_internal = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422  :
    case PIX_FMT_GRAY16LE : c->hyscale_internal = RENAME(uyvyToY); break;
    case PIX_FMT_BGR24    : c->hyscale_internal = RENAME(bgr24ToY); break;
    case PIX_FMT_BGR565   : c->hyscale_internal = RENAME(bgr16ToY); break;
    case PIX_FMT_BGR555   : c->hyscale_internal = RENAME(bgr15ToY); break;
    case PIX_FMT_RGB24    : c->hyscale_internal = RENAME(rgb24ToY); break;
    case PIX_FMT_RGB565   : c->hyscale_internal = RENAME(rgb16ToY); break;
    case PIX_FMT_RGB555   : c->hyscale_internal = RENAME(rgb15ToY); break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->hyscale_internal = RENAME(palToY); break;
    case PIX_FMT_MONOBLACK: c->hyscale_internal = RENAME(monoblack2Y); break;
    case PIX_FMT_MONOWHITE: c->hyscale_internal = RENAME(monowhite2Y); break;
    }
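
    /* 32 bit RGB/BGR sources: the chosen converter depends on whether an alpha plane is being scaled */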
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->hyscale_internal = RENAME(abgrToA); break;
        }
    } else {
        switch (srcFormat) {
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->hyscale_internal = RENAME(bgr32ToY); break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->hyscale_internal = RENAME(rgb32ToY); break;
        }
    }
}