  1. /*
  2. * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. *
  20. * The C code (not assembly, MMX, ...) of this file can be used
  21. * under the LGPL license.
  22. */
  23. #undef REAL_MOVNTQ
  24. #undef MOVNTQ
  25. #undef PAVGB
  26. #undef PREFETCH
  27. #undef PREFETCHW
  28. #if COMPILE_TEMPLATE_AMD3DNOW
  29. #define PREFETCH "prefetch"
  30. #define PREFETCHW "prefetchw"
  31. #elif COMPILE_TEMPLATE_MMX2
  32. #define PREFETCH "prefetchnta"
  33. #define PREFETCHW "prefetcht0"
  34. #else
  35. #define PREFETCH " # nop"
  36. #define PREFETCHW " # nop"
  37. #endif
  38. #if COMPILE_TEMPLATE_MMX2
  39. #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
  40. #elif COMPILE_TEMPLATE_AMD3DNOW
  41. #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
  42. #endif
  43. #if COMPILE_TEMPLATE_MMX2
  44. #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
  45. #else
  46. #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
  47. #endif
  48. #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
  49. #if COMPILE_TEMPLATE_ALTIVEC
  50. #include "ppc/swscale_altivec_template.c"
  51. #endif
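/* The YSCALEYUV2* macros below expand to the inner loops of the vertical
 * scaler as inline MMX assembly. MOVNTQ, PREFETCH and PAVGB are mapped
 * above to the best instruction available for the compiled template
 * (plain MMX, MMX2 or 3DNow!). */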
  52. #define YSCALEYUV2YV12X(x, offset, dest, width) \
  53. __asm__ volatile(\
  54. "xor %%"REG_a", %%"REG_a" \n\t"\
  55. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  56. "movq %%mm3, %%mm4 \n\t"\
  57. "lea " offset "(%0), %%"REG_d" \n\t"\
  58. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  59. ASMALIGN(4) /* FIXME Unroll? */\
  60. "1: \n\t"\
  61. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  62. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
  63. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
  64. "add $16, %%"REG_d" \n\t"\
  65. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  66. "test %%"REG_S", %%"REG_S" \n\t"\
  67. "pmulhw %%mm0, %%mm2 \n\t"\
  68. "pmulhw %%mm0, %%mm5 \n\t"\
  69. "paddw %%mm2, %%mm3 \n\t"\
  70. "paddw %%mm5, %%mm4 \n\t"\
  71. " jnz 1b \n\t"\
  72. "psraw $3, %%mm3 \n\t"\
  73. "psraw $3, %%mm4 \n\t"\
  74. "packuswb %%mm4, %%mm3 \n\t"\
  75. MOVNTQ(%%mm3, (%1, %%REGa))\
  76. "add $8, %%"REG_a" \n\t"\
  77. "cmp %2, %%"REG_a" \n\t"\
  78. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  79. "movq %%mm3, %%mm4 \n\t"\
  80. "lea " offset "(%0), %%"REG_d" \n\t"\
  81. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  82. "jb 1b \n\t"\
  83. :: "r" (&c->redDither),\
  84. "r" (dest), "g" (width)\
  85. : "%"REG_a, "%"REG_d, "%"REG_S\
  86. );
  87. #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
  88. __asm__ volatile(\
  89. "lea " offset "(%0), %%"REG_d" \n\t"\
  90. "xor %%"REG_a", %%"REG_a" \n\t"\
  91. "pxor %%mm4, %%mm4 \n\t"\
  92. "pxor %%mm5, %%mm5 \n\t"\
  93. "pxor %%mm6, %%mm6 \n\t"\
  94. "pxor %%mm7, %%mm7 \n\t"\
  95. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  96. ASMALIGN(4) \
  97. "1: \n\t"\
  98. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
  99. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
  100. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  101. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
  102. "movq %%mm0, %%mm3 \n\t"\
  103. "punpcklwd %%mm1, %%mm0 \n\t"\
  104. "punpckhwd %%mm1, %%mm3 \n\t"\
  105. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
  106. "pmaddwd %%mm1, %%mm0 \n\t"\
  107. "pmaddwd %%mm1, %%mm3 \n\t"\
  108. "paddd %%mm0, %%mm4 \n\t"\
  109. "paddd %%mm3, %%mm5 \n\t"\
  110. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
  111. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  112. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  113. "test %%"REG_S", %%"REG_S" \n\t"\
  114. "movq %%mm2, %%mm0 \n\t"\
  115. "punpcklwd %%mm3, %%mm2 \n\t"\
  116. "punpckhwd %%mm3, %%mm0 \n\t"\
  117. "pmaddwd %%mm1, %%mm2 \n\t"\
  118. "pmaddwd %%mm1, %%mm0 \n\t"\
  119. "paddd %%mm2, %%mm6 \n\t"\
  120. "paddd %%mm0, %%mm7 \n\t"\
  121. " jnz 1b \n\t"\
  122. "psrad $16, %%mm4 \n\t"\
  123. "psrad $16, %%mm5 \n\t"\
  124. "psrad $16, %%mm6 \n\t"\
  125. "psrad $16, %%mm7 \n\t"\
  126. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  127. "packssdw %%mm5, %%mm4 \n\t"\
  128. "packssdw %%mm7, %%mm6 \n\t"\
  129. "paddw %%mm0, %%mm4 \n\t"\
  130. "paddw %%mm0, %%mm6 \n\t"\
  131. "psraw $3, %%mm4 \n\t"\
  132. "psraw $3, %%mm6 \n\t"\
  133. "packuswb %%mm6, %%mm4 \n\t"\
  134. MOVNTQ(%%mm4, (%1, %%REGa))\
  135. "add $8, %%"REG_a" \n\t"\
  136. "cmp %2, %%"REG_a" \n\t"\
  137. "lea " offset "(%0), %%"REG_d" \n\t"\
  138. "pxor %%mm4, %%mm4 \n\t"\
  139. "pxor %%mm5, %%mm5 \n\t"\
  140. "pxor %%mm6, %%mm6 \n\t"\
  141. "pxor %%mm7, %%mm7 \n\t"\
  142. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  143. "jb 1b \n\t"\
  144. :: "r" (&c->redDither),\
  145. "r" (dest), "g" (width)\
  146. : "%"REG_a, "%"REG_d, "%"REG_S\
  147. );
  148. #define YSCALEYUV2YV121 \
  149. "mov %2, %%"REG_a" \n\t"\
  150. ASMALIGN(4) /* FIXME Unroll? */\
  151. "1: \n\t"\
  152. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
  153. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
  154. "psraw $7, %%mm0 \n\t"\
  155. "psraw $7, %%mm1 \n\t"\
  156. "packuswb %%mm1, %%mm0 \n\t"\
  157. MOVNTQ(%%mm0, (%1, %%REGa))\
  158. "add $8, %%"REG_a" \n\t"\
  159. "jnc 1b \n\t"
  160. #define YSCALEYUV2YV121_ACCURATE \
  161. "mov %2, %%"REG_a" \n\t"\
  162. "pcmpeqw %%mm7, %%mm7 \n\t"\
  163. "psrlw $15, %%mm7 \n\t"\
  164. "psllw $6, %%mm7 \n\t"\
  165. ASMALIGN(4) /* FIXME Unroll? */\
  166. "1: \n\t"\
  167. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
  168. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
  169. "paddsw %%mm7, %%mm0 \n\t"\
  170. "paddsw %%mm7, %%mm1 \n\t"\
  171. "psraw $7, %%mm0 \n\t"\
  172. "psraw $7, %%mm1 \n\t"\
  173. "packuswb %%mm1, %%mm0 \n\t"\
  174. MOVNTQ(%%mm0, (%1, %%REGa))\
  175. "add $8, %%"REG_a" \n\t"\
  176. "jnc 1b \n\t"
  177. /*
  178. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  179. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  180. "r" (dest), "m" (dstW),
  181. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  182. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  183. */
  184. #define YSCALEYUV2PACKEDX_UV \
  185. __asm__ volatile(\
  186. "xor %%"REG_a", %%"REG_a" \n\t"\
  187. ASMALIGN(4)\
  188. "nop \n\t"\
  189. "1: \n\t"\
  190. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  191. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  192. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  193. "movq %%mm3, %%mm4 \n\t"\
  194. ASMALIGN(4)\
  195. "2: \n\t"\
  196. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  197. "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
  198. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
  199. "add $16, %%"REG_d" \n\t"\
  200. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  201. "pmulhw %%mm0, %%mm2 \n\t"\
  202. "pmulhw %%mm0, %%mm5 \n\t"\
  203. "paddw %%mm2, %%mm3 \n\t"\
  204. "paddw %%mm5, %%mm4 \n\t"\
  205. "test %%"REG_S", %%"REG_S" \n\t"\
  206. " jnz 2b \n\t"\
  207. #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
  208. "lea "offset"(%0), %%"REG_d" \n\t"\
  209. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  210. "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
  211. "movq "#dst1", "#dst2" \n\t"\
  212. ASMALIGN(4)\
  213. "2: \n\t"\
  214. "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
  215. "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
  216. "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
  217. "add $16, %%"REG_d" \n\t"\
  218. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  219. "pmulhw "#coeff", "#src1" \n\t"\
  220. "pmulhw "#coeff", "#src2" \n\t"\
  221. "paddw "#src1", "#dst1" \n\t"\
  222. "paddw "#src2", "#dst2" \n\t"\
  223. "test %%"REG_S", %%"REG_S" \n\t"\
  224. " jnz 2b \n\t"\
  225. #define YSCALEYUV2PACKEDX \
  226. YSCALEYUV2PACKEDX_UV \
  227. YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
  228. #define YSCALEYUV2PACKEDX_END \
  229. :: "r" (&c->redDither), \
  230. "m" (dummy), "m" (dummy), "m" (dummy),\
  231. "r" (dest), "m" (dstW) \
  232. : "%"REG_a, "%"REG_d, "%"REG_S \
  233. );
  234. #define YSCALEYUV2PACKEDX_ACCURATE_UV \
  235. __asm__ volatile(\
  236. "xor %%"REG_a", %%"REG_a" \n\t"\
  237. ASMALIGN(4)\
  238. "nop \n\t"\
  239. "1: \n\t"\
  240. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  241. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  242. "pxor %%mm4, %%mm4 \n\t"\
  243. "pxor %%mm5, %%mm5 \n\t"\
  244. "pxor %%mm6, %%mm6 \n\t"\
  245. "pxor %%mm7, %%mm7 \n\t"\
  246. ASMALIGN(4)\
  247. "2: \n\t"\
  248. "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
  249. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
  250. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  251. "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
  252. "movq %%mm0, %%mm3 \n\t"\
  253. "punpcklwd %%mm1, %%mm0 \n\t"\
  254. "punpckhwd %%mm1, %%mm3 \n\t"\
  255. "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
  256. "pmaddwd %%mm1, %%mm0 \n\t"\
  257. "pmaddwd %%mm1, %%mm3 \n\t"\
  258. "paddd %%mm0, %%mm4 \n\t"\
  259. "paddd %%mm3, %%mm5 \n\t"\
  260. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
  261. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  262. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  263. "test %%"REG_S", %%"REG_S" \n\t"\
  264. "movq %%mm2, %%mm0 \n\t"\
  265. "punpcklwd %%mm3, %%mm2 \n\t"\
  266. "punpckhwd %%mm3, %%mm0 \n\t"\
  267. "pmaddwd %%mm1, %%mm2 \n\t"\
  268. "pmaddwd %%mm1, %%mm0 \n\t"\
  269. "paddd %%mm2, %%mm6 \n\t"\
  270. "paddd %%mm0, %%mm7 \n\t"\
  271. " jnz 2b \n\t"\
  272. "psrad $16, %%mm4 \n\t"\
  273. "psrad $16, %%mm5 \n\t"\
  274. "psrad $16, %%mm6 \n\t"\
  275. "psrad $16, %%mm7 \n\t"\
  276. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  277. "packssdw %%mm5, %%mm4 \n\t"\
  278. "packssdw %%mm7, %%mm6 \n\t"\
  279. "paddw %%mm0, %%mm4 \n\t"\
  280. "paddw %%mm0, %%mm6 \n\t"\
  281. "movq %%mm4, "U_TEMP"(%0) \n\t"\
  282. "movq %%mm6, "V_TEMP"(%0) \n\t"\
  283. #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
  284. "lea "offset"(%0), %%"REG_d" \n\t"\
  285. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  286. "pxor %%mm1, %%mm1 \n\t"\
  287. "pxor %%mm5, %%mm5 \n\t"\
  288. "pxor %%mm7, %%mm7 \n\t"\
  289. "pxor %%mm6, %%mm6 \n\t"\
  290. ASMALIGN(4)\
  291. "2: \n\t"\
  292. "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
  293. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
  294. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  295. "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
  296. "movq %%mm0, %%mm3 \n\t"\
  297. "punpcklwd %%mm4, %%mm0 \n\t"\
  298. "punpckhwd %%mm4, %%mm3 \n\t"\
  299. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
  300. "pmaddwd %%mm4, %%mm0 \n\t"\
  301. "pmaddwd %%mm4, %%mm3 \n\t"\
  302. "paddd %%mm0, %%mm1 \n\t"\
  303. "paddd %%mm3, %%mm5 \n\t"\
  304. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
  305. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  306. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  307. "test %%"REG_S", %%"REG_S" \n\t"\
  308. "movq %%mm2, %%mm0 \n\t"\
  309. "punpcklwd %%mm3, %%mm2 \n\t"\
  310. "punpckhwd %%mm3, %%mm0 \n\t"\
  311. "pmaddwd %%mm4, %%mm2 \n\t"\
  312. "pmaddwd %%mm4, %%mm0 \n\t"\
  313. "paddd %%mm2, %%mm7 \n\t"\
  314. "paddd %%mm0, %%mm6 \n\t"\
  315. " jnz 2b \n\t"\
  316. "psrad $16, %%mm1 \n\t"\
  317. "psrad $16, %%mm5 \n\t"\
  318. "psrad $16, %%mm7 \n\t"\
  319. "psrad $16, %%mm6 \n\t"\
  320. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  321. "packssdw %%mm5, %%mm1 \n\t"\
  322. "packssdw %%mm6, %%mm7 \n\t"\
  323. "paddw %%mm0, %%mm1 \n\t"\
  324. "paddw %%mm0, %%mm7 \n\t"\
  325. "movq "U_TEMP"(%0), %%mm3 \n\t"\
  326. "movq "V_TEMP"(%0), %%mm4 \n\t"\
  327. #define YSCALEYUV2PACKEDX_ACCURATE \
  328. YSCALEYUV2PACKEDX_ACCURATE_UV \
  329. YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
  330. #define YSCALEYUV2RGBX \
  331. "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
  332. "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
  333. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  334. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  335. "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
  336. "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
  337. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  338. "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
  339. "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
  340. "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
  341. "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
  342. "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
  343. "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
  344. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  345. "paddw %%mm3, %%mm4 \n\t"\
  346. "movq %%mm2, %%mm0 \n\t"\
  347. "movq %%mm5, %%mm6 \n\t"\
  348. "movq %%mm4, %%mm3 \n\t"\
  349. "punpcklwd %%mm2, %%mm2 \n\t"\
  350. "punpcklwd %%mm5, %%mm5 \n\t"\
  351. "punpcklwd %%mm4, %%mm4 \n\t"\
  352. "paddw %%mm1, %%mm2 \n\t"\
  353. "paddw %%mm1, %%mm5 \n\t"\
  354. "paddw %%mm1, %%mm4 \n\t"\
  355. "punpckhwd %%mm0, %%mm0 \n\t"\
  356. "punpckhwd %%mm6, %%mm6 \n\t"\
  357. "punpckhwd %%mm3, %%mm3 \n\t"\
  358. "paddw %%mm7, %%mm0 \n\t"\
  359. "paddw %%mm7, %%mm6 \n\t"\
  360. "paddw %%mm7, %%mm3 \n\t"\
  361. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  362. "packuswb %%mm0, %%mm2 \n\t"\
  363. "packuswb %%mm6, %%mm5 \n\t"\
  364. "packuswb %%mm3, %%mm4 \n\t"\
  365. #define REAL_YSCALEYUV2PACKED(index, c) \
  366. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  367. "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
  368. "psraw $3, %%mm0 \n\t"\
  369. "psraw $3, %%mm1 \n\t"\
  370. "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  371. "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  372. "xor "#index", "#index" \n\t"\
  373. ASMALIGN(4)\
  374. "1: \n\t"\
  375. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  376. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  377. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  378. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  379. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  380. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  381. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  382. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  383. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  384. "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  385. "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  386. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  387. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  388. "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  389. "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  390. "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  391. "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  392. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  393. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  394. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  395. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  396. "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  397. "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  398. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  399. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  400. #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
  401. #define REAL_YSCALEYUV2RGB_UV(index, c) \
  402. "xor "#index", "#index" \n\t"\
  403. ASMALIGN(4)\
  404. "1: \n\t"\
  405. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  406. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  407. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  408. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  409. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  410. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  411. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  412. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  413. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  414. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  415. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  416. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  417. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  418. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  419. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  420. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  421. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  422. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  423. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  424. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  425. #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
  426. "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  427. "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  428. "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  429. "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  430. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  431. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  432. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  433. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  434. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  435. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  436. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  437. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  438. #define REAL_YSCALEYUV2RGB_COEFF(c) \
  439. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  440. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  441. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  442. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  443. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  444. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  445. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  446. "paddw %%mm3, %%mm4 \n\t"\
  447. "movq %%mm2, %%mm0 \n\t"\
  448. "movq %%mm5, %%mm6 \n\t"\
  449. "movq %%mm4, %%mm3 \n\t"\
  450. "punpcklwd %%mm2, %%mm2 \n\t"\
  451. "punpcklwd %%mm5, %%mm5 \n\t"\
  452. "punpcklwd %%mm4, %%mm4 \n\t"\
  453. "paddw %%mm1, %%mm2 \n\t"\
  454. "paddw %%mm1, %%mm5 \n\t"\
  455. "paddw %%mm1, %%mm4 \n\t"\
  456. "punpckhwd %%mm0, %%mm0 \n\t"\
  457. "punpckhwd %%mm6, %%mm6 \n\t"\
  458. "punpckhwd %%mm3, %%mm3 \n\t"\
  459. "paddw %%mm7, %%mm0 \n\t"\
  460. "paddw %%mm7, %%mm6 \n\t"\
  461. "paddw %%mm7, %%mm3 \n\t"\
  462. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  463. "packuswb %%mm0, %%mm2 \n\t"\
  464. "packuswb %%mm6, %%mm5 \n\t"\
  465. "packuswb %%mm3, %%mm4 \n\t"\
  466. #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
  467. #define YSCALEYUV2RGB(index, c) \
  468. REAL_YSCALEYUV2RGB_UV(index, c) \
  469. REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
  470. REAL_YSCALEYUV2RGB_COEFF(c)
  471. #define REAL_YSCALEYUV2PACKED1(index, c) \
  472. "xor "#index", "#index" \n\t"\
  473. ASMALIGN(4)\
  474. "1: \n\t"\
  475. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  476. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  477. "psraw $7, %%mm3 \n\t" \
  478. "psraw $7, %%mm4 \n\t" \
  479. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  480. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  481. "psraw $7, %%mm1 \n\t" \
  482. "psraw $7, %%mm7 \n\t" \
  483. #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
  484. #define REAL_YSCALEYUV2RGB1(index, c) \
  485. "xor "#index", "#index" \n\t"\
  486. ASMALIGN(4)\
  487. "1: \n\t"\
  488. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  489. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  490. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  491. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  492. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  493. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  494. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  495. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  496. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  497. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  498. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  499. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  500. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  501. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  502. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  503. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  504. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  505. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  506. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  507. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  508. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  509. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  510. "paddw %%mm3, %%mm4 \n\t"\
  511. "movq %%mm2, %%mm0 \n\t"\
  512. "movq %%mm5, %%mm6 \n\t"\
  513. "movq %%mm4, %%mm3 \n\t"\
  514. "punpcklwd %%mm2, %%mm2 \n\t"\
  515. "punpcklwd %%mm5, %%mm5 \n\t"\
  516. "punpcklwd %%mm4, %%mm4 \n\t"\
  517. "paddw %%mm1, %%mm2 \n\t"\
  518. "paddw %%mm1, %%mm5 \n\t"\
  519. "paddw %%mm1, %%mm4 \n\t"\
  520. "punpckhwd %%mm0, %%mm0 \n\t"\
  521. "punpckhwd %%mm6, %%mm6 \n\t"\
  522. "punpckhwd %%mm3, %%mm3 \n\t"\
  523. "paddw %%mm7, %%mm0 \n\t"\
  524. "paddw %%mm7, %%mm6 \n\t"\
  525. "paddw %%mm7, %%mm3 \n\t"\
  526. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  527. "packuswb %%mm0, %%mm2 \n\t"\
  528. "packuswb %%mm6, %%mm5 \n\t"\
  529. "packuswb %%mm3, %%mm4 \n\t"\
  530. #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
  531. #define REAL_YSCALEYUV2PACKED1b(index, c) \
  532. "xor "#index", "#index" \n\t"\
  533. ASMALIGN(4)\
  534. "1: \n\t"\
  535. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  536. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  537. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  538. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  539. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  540. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  541. "psrlw $8, %%mm3 \n\t" \
  542. "psrlw $8, %%mm4 \n\t" \
  543. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  544. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  545. "psraw $7, %%mm1 \n\t" \
  546. "psraw $7, %%mm7 \n\t"
  547. #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
  548. // do vertical chrominance interpolation
  549. #define REAL_YSCALEYUV2RGB1b(index, c) \
  550. "xor "#index", "#index" \n\t"\
  551. ASMALIGN(4)\
  552. "1: \n\t"\
  553. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  554. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  555. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  556. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  557. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  558. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  559. "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
  560. "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
  561. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  562. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  563. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  564. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  565. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  566. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  567. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  568. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  569. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  570. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  571. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  572. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  573. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  574. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  575. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  576. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  577. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  578. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  579. "paddw %%mm3, %%mm4 \n\t"\
  580. "movq %%mm2, %%mm0 \n\t"\
  581. "movq %%mm5, %%mm6 \n\t"\
  582. "movq %%mm4, %%mm3 \n\t"\
  583. "punpcklwd %%mm2, %%mm2 \n\t"\
  584. "punpcklwd %%mm5, %%mm5 \n\t"\
  585. "punpcklwd %%mm4, %%mm4 \n\t"\
  586. "paddw %%mm1, %%mm2 \n\t"\
  587. "paddw %%mm1, %%mm5 \n\t"\
  588. "paddw %%mm1, %%mm4 \n\t"\
  589. "punpckhwd %%mm0, %%mm0 \n\t"\
  590. "punpckhwd %%mm6, %%mm6 \n\t"\
  591. "punpckhwd %%mm3, %%mm3 \n\t"\
  592. "paddw %%mm7, %%mm0 \n\t"\
  593. "paddw %%mm7, %%mm6 \n\t"\
  594. "paddw %%mm7, %%mm3 \n\t"\
  595. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  596. "packuswb %%mm0, %%mm2 \n\t"\
  597. "packuswb %%mm6, %%mm5 \n\t"\
  598. "packuswb %%mm3, %%mm4 \n\t"\
  599. #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
  600. #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
  601. "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
  602. "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
  603. "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
  604. "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
  605. "packuswb %%mm1, %%mm7 \n\t"
  606. #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
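/* The WRITE* macros store the byte vectors produced by the YSCALEYUV2RGB*
 * macros in the destination pixel format (32-bit RGB, 24-bit RGB,
 * RGB565/555), while WRITEYUY2 interleaves the Y/U/V samples; each one
 * advances the loop counter and branches back to label 1. */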
  607. #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
  608. "movq "#b", "#q2" \n\t" /* B */\
  609. "movq "#r", "#t" \n\t" /* R */\
  610. "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
  611. "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
  612. "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
  613. "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
  614. "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
  615. "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
  616. "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
  617. "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
  618. "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
  619. "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
  620. \
  621. MOVNTQ( q0, (dst, index, 4))\
  622. MOVNTQ( b, 8(dst, index, 4))\
  623. MOVNTQ( q2, 16(dst, index, 4))\
  624. MOVNTQ( q3, 24(dst, index, 4))\
  625. \
  626. "add $8, "#index" \n\t"\
  627. "cmp "#dstw", "#index" \n\t"\
  628. " jb 1b \n\t"
  629. #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
  630. #define REAL_WRITERGB16(dst, dstw, index) \
  631. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  632. "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
  633. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  634. "psrlq $3, %%mm2 \n\t"\
  635. \
  636. "movq %%mm2, %%mm1 \n\t"\
  637. "movq %%mm4, %%mm3 \n\t"\
  638. \
  639. "punpcklbw %%mm7, %%mm3 \n\t"\
  640. "punpcklbw %%mm5, %%mm2 \n\t"\
  641. "punpckhbw %%mm7, %%mm4 \n\t"\
  642. "punpckhbw %%mm5, %%mm1 \n\t"\
  643. \
  644. "psllq $3, %%mm3 \n\t"\
  645. "psllq $3, %%mm4 \n\t"\
  646. \
  647. "por %%mm3, %%mm2 \n\t"\
  648. "por %%mm4, %%mm1 \n\t"\
  649. \
  650. MOVNTQ(%%mm2, (dst, index, 2))\
  651. MOVNTQ(%%mm1, 8(dst, index, 2))\
  652. \
  653. "add $8, "#index" \n\t"\
  654. "cmp "#dstw", "#index" \n\t"\
  655. " jb 1b \n\t"
  656. #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
  657. #define REAL_WRITERGB15(dst, dstw, index) \
  658. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  659. "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
  660. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  661. "psrlq $3, %%mm2 \n\t"\
  662. "psrlq $1, %%mm5 \n\t"\
  663. \
  664. "movq %%mm2, %%mm1 \n\t"\
  665. "movq %%mm4, %%mm3 \n\t"\
  666. \
  667. "punpcklbw %%mm7, %%mm3 \n\t"\
  668. "punpcklbw %%mm5, %%mm2 \n\t"\
  669. "punpckhbw %%mm7, %%mm4 \n\t"\
  670. "punpckhbw %%mm5, %%mm1 \n\t"\
  671. \
  672. "psllq $2, %%mm3 \n\t"\
  673. "psllq $2, %%mm4 \n\t"\
  674. \
  675. "por %%mm3, %%mm2 \n\t"\
  676. "por %%mm4, %%mm1 \n\t"\
  677. \
  678. MOVNTQ(%%mm2, (dst, index, 2))\
  679. MOVNTQ(%%mm1, 8(dst, index, 2))\
  680. \
  681. "add $8, "#index" \n\t"\
  682. "cmp "#dstw", "#index" \n\t"\
  683. " jb 1b \n\t"
  684. #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
  685. #define WRITEBGR24OLD(dst, dstw, index) \
  686. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  687. "movq %%mm2, %%mm1 \n\t" /* B */\
  688. "movq %%mm5, %%mm6 \n\t" /* R */\
  689. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  690. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  691. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  692. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  693. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  694. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  695. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  696. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  697. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  698. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  699. \
  700. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  701. "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
  702. "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
  703. "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
  704. "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
  705. "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
  706. "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
  707. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  708. \
  709. "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  710. "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
  711. "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
  712. "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
  713. "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
  714. "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
  715. "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
  716. "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
  717. "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
  718. "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
  719. "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
  720. "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
  721. "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
  722. \
  723. "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
  724. "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
  725. "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
  726. "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
  727. "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
  728. "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
  729. "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
  730. "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
  731. \
  732. MOVNTQ(%%mm0, (dst))\
  733. MOVNTQ(%%mm2, 8(dst))\
  734. MOVNTQ(%%mm3, 16(dst))\
  735. "add $24, "#dst" \n\t"\
  736. \
  737. "add $8, "#index" \n\t"\
  738. "cmp "#dstw", "#index" \n\t"\
  739. " jb 1b \n\t"
  740. #define WRITEBGR24MMX(dst, dstw, index) \
  741. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  742. "movq %%mm2, %%mm1 \n\t" /* B */\
  743. "movq %%mm5, %%mm6 \n\t" /* R */\
  744. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  745. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  746. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  747. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  748. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  749. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  750. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  751. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  752. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  753. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  754. \
  755. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  756. "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
  757. "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
  758. "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
  759. \
  760. "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
  761. "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
  762. "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
  763. "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
  764. \
  765. "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
  766. "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
  767. "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
  768. "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
  769. \
  770. "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
  771. "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
  772. "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
  773. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  774. MOVNTQ(%%mm0, (dst))\
  775. \
  776. "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
  777. "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
  778. "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
  779. "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
  780. MOVNTQ(%%mm6, 8(dst))\
  781. \
  782. "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
  783. "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
  784. "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
  785. MOVNTQ(%%mm5, 16(dst))\
  786. \
  787. "add $24, "#dst" \n\t"\
  788. \
  789. "add $8, "#index" \n\t"\
  790. "cmp "#dstw", "#index" \n\t"\
  791. " jb 1b \n\t"
  792. #define WRITEBGR24MMX2(dst, dstw, index) \
  793. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  794. "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
  795. "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
  796. "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
  797. "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
  798. "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
  799. \
  800. "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
  801. "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
  802. "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
  803. \
  804. "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
  805. "por %%mm1, %%mm6 \n\t"\
  806. "por %%mm3, %%mm6 \n\t"\
  807. MOVNTQ(%%mm6, (dst))\
  808. \
  809. "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
  810. "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
  811. "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
  812. "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
  813. \
  814. "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
  815. "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
  816. "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
  817. \
  818. "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
  819. "por %%mm3, %%mm6 \n\t"\
  820. MOVNTQ(%%mm6, 8(dst))\
  821. \
  822. "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
  823. "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
  824. "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
  825. \
  826. "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
  827. "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
  828. "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
  829. \
  830. "por %%mm1, %%mm3 \n\t"\
  831. "por %%mm3, %%mm6 \n\t"\
  832. MOVNTQ(%%mm6, 16(dst))\
  833. \
  834. "add $24, "#dst" \n\t"\
  835. \
  836. "add $8, "#index" \n\t"\
  837. "cmp "#dstw", "#index" \n\t"\
  838. " jb 1b \n\t"
  839. #if COMPILE_TEMPLATE_MMX2
  840. #undef WRITEBGR24
  841. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
  842. #else
  843. #undef WRITEBGR24
  844. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
  845. #endif
  846. #define REAL_WRITEYUY2(dst, dstw, index) \
  847. "packuswb %%mm3, %%mm3 \n\t"\
  848. "packuswb %%mm4, %%mm4 \n\t"\
  849. "packuswb %%mm7, %%mm1 \n\t"\
  850. "punpcklbw %%mm4, %%mm3 \n\t"\
  851. "movq %%mm1, %%mm7 \n\t"\
  852. "punpcklbw %%mm3, %%mm1 \n\t"\
  853. "punpckhbw %%mm3, %%mm7 \n\t"\
  854. \
  855. MOVNTQ(%%mm1, (dst, index, 2))\
  856. MOVNTQ(%%mm7, 8(dst, index, 2))\
  857. \
  858. "add $8, "#index" \n\t"\
  859. "cmp "#dstw", "#index" \n\t"\
  860. " jb 1b \n\t"
  861. #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
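/**
 * vertical N-tap scale to planar YV12 output (plus optional alpha plane)
 */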
  862. static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
  863. const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
  864. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
  865. {
  866. #if COMPILE_TEMPLATE_MMX
  867. if(!(c->flags & SWS_BITEXACT)) {
  868. if (c->flags & SWS_ACCURATE_RND) {
  869. if (uDest) {
  870. YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
  871. YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
  872. }
  873. if (CONFIG_SWSCALE_ALPHA && aDest) {
  874. YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
  875. }
  876. YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
  877. } else {
  878. if (uDest) {
  879. YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
  880. YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
  881. }
  882. if (CONFIG_SWSCALE_ALPHA && aDest) {
  883. YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
  884. }
  885. YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
  886. }
  887. return;
  888. }
  889. #endif
  890. #if COMPILE_TEMPLATE_ALTIVEC
  891. yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
  892. chrFilter, chrSrc, chrFilterSize,
  893. dest, uDest, vDest, dstW, chrDstW);
  894. #else //COMPILE_TEMPLATE_ALTIVEC
  895. yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
  896. chrFilter, chrSrc, chrFilterSize,
  897. alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
  898. #endif //!COMPILE_TEMPLATE_ALTIVEC
  899. }
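/**
 * vertical scale to NV12/NV21 output (forwards to the C implementation)
 */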
  900. static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
  901. const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
  902. uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
  903. {
  904. yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
  905. chrFilter, chrSrc, chrFilterSize,
  906. dest, uDest, dstW, chrDstW, dstFormat);
  907. }
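/**
 * vertical copy without scaling: convert the 16-bit intermediate lines
 * to 8-bit planar output (shift by 7 with clipping)
 */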
  908. static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
  909. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
  910. {
  911. int i;
  912. #if COMPILE_TEMPLATE_MMX
  913. if(!(c->flags & SWS_BITEXACT)) {
  914. long p= 4;
  915. uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
  916. uint8_t *dst[4]= {aDest, dest, uDest, vDest};
  917. x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
  918. if (c->flags & SWS_ACCURATE_RND) {
  919. while(p--) {
  920. if (dst[p]) {
  921. __asm__ volatile(
  922. YSCALEYUV2YV121_ACCURATE
  923. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  924. "g" (-counter[p])
  925. : "%"REG_a
  926. );
  927. }
  928. }
  929. } else {
  930. while(p--) {
  931. if (dst[p]) {
  932. __asm__ volatile(
  933. YSCALEYUV2YV121
  934. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  935. "g" (-counter[p])
  936. : "%"REG_a
  937. );
  938. }
  939. }
  940. }
  941. return;
  942. }
  943. #endif
  944. for (i=0; i<dstW; i++) {
  945. int val= (lumSrc[i]+64)>>7;
  946. if (val&256) {
  947. if (val<0) val=0;
  948. else val=255;
  949. }
  950. dest[i]= val;
  951. }
  952. if (uDest)
  953. for (i=0; i<chrDstW; i++) {
  954. int u=(chrSrc[i ]+64)>>7;
  955. int v=(chrSrc[i + VOFW]+64)>>7;
  956. if ((u|v)&256) {
  957. if (u<0) u=0;
  958. else if (u>255) u=255;
  959. if (v<0) v=0;
  960. else if (v>255) v=255;
  961. }
  962. uDest[i]= u;
  963. vDest[i]= v;
  964. }
  965. if (CONFIG_SWSCALE_ALPHA && aDest)
  966. for (i=0; i<dstW; i++) {
  967. int val= (alpSrc[i]+64)>>7;
  968. aDest[i]= av_clip_uint8(val);
  969. }
  970. }
  971. /**
  972. * vertical scale YV12 to RGB
  973. */
  974. static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
  975. const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
  976. const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
  977. {
  978. #if COMPILE_TEMPLATE_MMX
  979. x86_reg dummy=0;
  980. if(!(c->flags & SWS_BITEXACT)) {
  981. if (c->flags & SWS_ACCURATE_RND) {
  982. switch(c->dstFormat) {
  983. case PIX_FMT_RGB32:
  984. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  985. YSCALEYUV2PACKEDX_ACCURATE
  986. YSCALEYUV2RGBX
  987. "movq %%mm2, "U_TEMP"(%0) \n\t"
  988. "movq %%mm4, "V_TEMP"(%0) \n\t"
  989. "movq %%mm5, "Y_TEMP"(%0) \n\t"
  990. YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
  991. "movq "Y_TEMP"(%0), %%mm5 \n\t"
  992. "psraw $3, %%mm1 \n\t"
  993. "psraw $3, %%mm7 \n\t"
  994. "packuswb %%mm7, %%mm1 \n\t"
  995. WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
  996. YSCALEYUV2PACKEDX_END
  997. } else {
  998. YSCALEYUV2PACKEDX_ACCURATE
  999. YSCALEYUV2RGBX
  1000. "pcmpeqd %%mm7, %%mm7 \n\t"
  1001. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1002. YSCALEYUV2PACKEDX_END
  1003. }
  1004. return;
  1005. case PIX_FMT_BGR24:
  1006. YSCALEYUV2PACKEDX_ACCURATE
  1007. YSCALEYUV2RGBX
  1008. "pxor %%mm7, %%mm7 \n\t"
  1009. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
  1010. "add %4, %%"REG_c" \n\t"
  1011. WRITEBGR24(%%REGc, %5, %%REGa)
  1012. :: "r" (&c->redDither),
  1013. "m" (dummy), "m" (dummy), "m" (dummy),
  1014. "r" (dest), "m" (dstW)
  1015. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  1016. );
  1017. return;
  1018. case PIX_FMT_RGB555:
  1019. YSCALEYUV2PACKEDX_ACCURATE
  1020. YSCALEYUV2RGBX
  1021. "pxor %%mm7, %%mm7 \n\t"
  1022. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1023. #ifdef DITHER1XBPP
  1024. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  1025. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  1026. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  1027. #endif
  1028. WRITERGB15(%4, %5, %%REGa)
  1029. YSCALEYUV2PACKEDX_END
  1030. return;
  1031. case PIX_FMT_RGB565:
  1032. YSCALEYUV2PACKEDX_ACCURATE
  1033. YSCALEYUV2RGBX
  1034. "pxor %%mm7, %%mm7 \n\t"
  1035. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1036. #ifdef DITHER1XBPP
  1037. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  1038. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  1039. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  1040. #endif
  1041. WRITERGB16(%4, %5, %%REGa)
  1042. YSCALEYUV2PACKEDX_END
  1043. return;
  1044. case PIX_FMT_YUYV422:
  1045. YSCALEYUV2PACKEDX_ACCURATE
  1046. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1047. "psraw $3, %%mm3 \n\t"
  1048. "psraw $3, %%mm4 \n\t"
  1049. "psraw $3, %%mm1 \n\t"
  1050. "psraw $3, %%mm7 \n\t"
  1051. WRITEYUY2(%4, %5, %%REGa)
  1052. YSCALEYUV2PACKEDX_END
  1053. return;
  1054. }
  1055. } else {
  1056. switch(c->dstFormat) {
  1057. case PIX_FMT_RGB32:
  1058. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1059. YSCALEYUV2PACKEDX
  1060. YSCALEYUV2RGBX
  1061. YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
  1062. "psraw $3, %%mm1 \n\t"
  1063. "psraw $3, %%mm7 \n\t"
  1064. "packuswb %%mm7, %%mm1 \n\t"
  1065. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1066. YSCALEYUV2PACKEDX_END
  1067. } else {
  1068. YSCALEYUV2PACKEDX
  1069. YSCALEYUV2RGBX
  1070. "pcmpeqd %%mm7, %%mm7 \n\t"
  1071. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1072. YSCALEYUV2PACKEDX_END
  1073. }
  1074. return;
  1075. case PIX_FMT_BGR24:
  1076. YSCALEYUV2PACKEDX
  1077. YSCALEYUV2RGBX
  1078. "pxor %%mm7, %%mm7 \n\t"
  1079. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
  1080. "add %4, %%"REG_c" \n\t"
  1081. WRITEBGR24(%%REGc, %5, %%REGa)
  1082. :: "r" (&c->redDither),
  1083. "m" (dummy), "m" (dummy), "m" (dummy),
  1084. "r" (dest), "m" (dstW)
  1085. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  1086. );
  1087. return;
  1088. case PIX_FMT_RGB555:
  1089. YSCALEYUV2PACKEDX
  1090. YSCALEYUV2RGBX
  1091. "pxor %%mm7, %%mm7 \n\t"
  1092. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1093. #ifdef DITHER1XBPP
  1094. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  1095. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  1096. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  1097. #endif
  1098. WRITERGB15(%4, %5, %%REGa)
  1099. YSCALEYUV2PACKEDX_END
  1100. return;
  1101. case PIX_FMT_RGB565:
  1102. YSCALEYUV2PACKEDX
  1103. YSCALEYUV2RGBX
  1104. "pxor %%mm7, %%mm7 \n\t"
  1105. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1106. #ifdef DITHER1XBPP
  1107. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  1108. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  1109. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  1110. #endif
  1111. WRITERGB16(%4, %5, %%REGa)
  1112. YSCALEYUV2PACKEDX_END
  1113. return;
  1114. case PIX_FMT_YUYV422:
  1115. YSCALEYUV2PACKEDX
  1116. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1117. "psraw $3, %%mm3 \n\t"
  1118. "psraw $3, %%mm4 \n\t"
  1119. "psraw $3, %%mm1 \n\t"
  1120. "psraw $3, %%mm7 \n\t"
  1121. WRITEYUY2(%4, %5, %%REGa)
  1122. YSCALEYUV2PACKEDX_END
  1123. return;
  1124. }
  1125. }
  1126. }
  1127. #endif /* COMPILE_TEMPLATE_MMX */
  1128. #if COMPILE_TEMPLATE_ALTIVEC
  1129. /* The following list of supported dstFormat values should
  1130. match what's found in the body of ff_yuv2packedX_altivec() */
  1131. if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
  1132. (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
  1133. c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
  1134. c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
  1135. ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
  1136. chrFilter, chrSrc, chrFilterSize,
  1137. dest, dstW, dstY);
  1138. else
  1139. #endif
  1140. yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
  1141. chrFilter, chrSrc, chrFilterSize,
  1142. alpSrc, dest, dstW, dstY);
  1143. }
  1144. /**
  1145. * vertical bilinear scale YV12 to RGB
  1146. */
  1147. static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
  1148. const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
  1149. {
  1150. int yalpha1=4095- yalpha;
  1151. int uvalpha1=4095-uvalpha;
  1152. int i;
  1153. #if COMPILE_TEMPLATE_MMX
  1154. if(!(c->flags & SWS_BITEXACT)) {
  1155. switch(c->dstFormat) {
  1156. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  1157. case PIX_FMT_RGB32:
  1158. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1159. #if ARCH_X86_64
  1160. __asm__ volatile(
  1161. YSCALEYUV2RGB(%%REGBP, %5)
  1162. YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
  1163. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1164. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1165. "packuswb %%mm7, %%mm1 \n\t"
  1166. WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1167. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
  1168. "a" (&c->redDither)
  1169. ,"r" (abuf0), "r" (abuf1)
  1170. : "%"REG_BP
  1171. );
  1172. #else
  1173. *(uint16_t **)(&c->u_temp)=abuf0;
  1174. *(uint16_t **)(&c->v_temp)=abuf1;
  1175. __asm__ volatile(
  1176. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1177. "mov %4, %%"REG_b" \n\t"
  1178. "push %%"REG_BP" \n\t"
  1179. YSCALEYUV2RGB(%%REGBP, %5)
  1180. "push %0 \n\t"
  1181. "push %1 \n\t"
  1182. "mov "U_TEMP"(%5), %0 \n\t"
  1183. "mov "V_TEMP"(%5), %1 \n\t"
  1184. YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
  1185. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1186. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1187. "packuswb %%mm7, %%mm1 \n\t"
  1188. "pop %1 \n\t"
  1189. "pop %0 \n\t"
  1190. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1191. "pop %%"REG_BP" \n\t"
  1192. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1193. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1194. "a" (&c->redDither)
  1195. );
  1196. #endif
  1197. } else {
  1198. __asm__ volatile(
  1199. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1200. "mov %4, %%"REG_b" \n\t"
  1201. "push %%"REG_BP" \n\t"
  1202. YSCALEYUV2RGB(%%REGBP, %5)
  1203. "pcmpeqd %%mm7, %%mm7 \n\t"
  1204. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1205. "pop %%"REG_BP" \n\t"
  1206. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1207. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1208. "a" (&c->redDither)
  1209. );
  1210. }
  1211. return;
  1212. case PIX_FMT_BGR24:
  1213. __asm__ volatile(
  1214. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1215. "mov %4, %%"REG_b" \n\t"
  1216. "push %%"REG_BP" \n\t"
  1217. YSCALEYUV2RGB(%%REGBP, %5)
  1218. "pxor %%mm7, %%mm7 \n\t"
  1219. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1220. "pop %%"REG_BP" \n\t"
  1221. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1222. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1223. "a" (&c->redDither)
  1224. );
  1225. return;
  1226. case PIX_FMT_RGB555:
  1227. __asm__ volatile(
  1228. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1229. "mov %4, %%"REG_b" \n\t"
  1230. "push %%"REG_BP" \n\t"
  1231. YSCALEYUV2RGB(%%REGBP, %5)
  1232. "pxor %%mm7, %%mm7 \n\t"
  1233. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1234. #ifdef DITHER1XBPP
  1235. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1236. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1237. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1238. #endif
  1239. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1240. "pop %%"REG_BP" \n\t"
  1241. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1242. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1243. "a" (&c->redDither)
  1244. );
  1245. return;
  1246. case PIX_FMT_RGB565:
  1247. __asm__ volatile(
  1248. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1249. "mov %4, %%"REG_b" \n\t"
  1250. "push %%"REG_BP" \n\t"
  1251. YSCALEYUV2RGB(%%REGBP, %5)
  1252. "pxor %%mm7, %%mm7 \n\t"
  1253. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1254. #ifdef DITHER1XBPP
  1255. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1256. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1257. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1258. #endif
  1259. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1260. "pop %%"REG_BP" \n\t"
  1261. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1262. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1263. "a" (&c->redDither)
  1264. );
  1265. return;
  1266. case PIX_FMT_YUYV422:
  1267. __asm__ volatile(
  1268. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1269. "mov %4, %%"REG_b" \n\t"
  1270. "push %%"REG_BP" \n\t"
  1271. YSCALEYUV2PACKED(%%REGBP, %5)
  1272. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1273. "pop %%"REG_BP" \n\t"
  1274. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1275. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1276. "a" (&c->redDither)
  1277. );
  1278. return;
  1279. default: break;
  1280. }
  1281. }
  1282. #endif //COMPILE_TEMPLATE_MMX
  1283. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
  1284. }
  1285. /**
  1286. * YV12 to RGB without scaling or interpolating
  1287. */
  1288. static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
  1289. const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
  1290. {
  1291. const int yalpha1=0;
  1292. int i;
  1293. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1294. const int yalpha= 4096; //FIXME ...
  1295. if (flags&SWS_FULL_CHR_H_INT) {
  1296. c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
  1297. return;
  1298. }
  1299. #if COMPILE_TEMPLATE_MMX
  1300. if(!(flags & SWS_BITEXACT)) {
  1301. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1302. switch(dstFormat) {
  1303. case PIX_FMT_RGB32:
  1304. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1305. __asm__ volatile(
  1306. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1307. "mov %4, %%"REG_b" \n\t"
  1308. "push %%"REG_BP" \n\t"
  1309. YSCALEYUV2RGB1(%%REGBP, %5)
  1310. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1311. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1312. "pop %%"REG_BP" \n\t"
  1313. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1314. :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1315. "a" (&c->redDither)
  1316. );
  1317. } else {
  1318. __asm__ volatile(
  1319. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1320. "mov %4, %%"REG_b" \n\t"
  1321. "push %%"REG_BP" \n\t"
  1322. YSCALEYUV2RGB1(%%REGBP, %5)
  1323. "pcmpeqd %%mm7, %%mm7 \n\t"
  1324. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1325. "pop %%"REG_BP" \n\t"
  1326. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1327. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1328. "a" (&c->redDither)
  1329. );
  1330. }
  1331. return;
  1332. case PIX_FMT_BGR24:
  1333. __asm__ volatile(
  1334. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1335. "mov %4, %%"REG_b" \n\t"
  1336. "push %%"REG_BP" \n\t"
  1337. YSCALEYUV2RGB1(%%REGBP, %5)
  1338. "pxor %%mm7, %%mm7 \n\t"
  1339. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1340. "pop %%"REG_BP" \n\t"
  1341. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1342. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1343. "a" (&c->redDither)
  1344. );
  1345. return;
  1346. case PIX_FMT_RGB555:
  1347. __asm__ volatile(
  1348. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1349. "mov %4, %%"REG_b" \n\t"
  1350. "push %%"REG_BP" \n\t"
  1351. YSCALEYUV2RGB1(%%REGBP, %5)
  1352. "pxor %%mm7, %%mm7 \n\t"
  1353. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1354. #ifdef DITHER1XBPP
  1355. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1356. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1357. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1358. #endif
  1359. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1360. "pop %%"REG_BP" \n\t"
  1361. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1362. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1363. "a" (&c->redDither)
  1364. );
  1365. return;
  1366. case PIX_FMT_RGB565:
  1367. __asm__ volatile(
  1368. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1369. "mov %4, %%"REG_b" \n\t"
  1370. "push %%"REG_BP" \n\t"
  1371. YSCALEYUV2RGB1(%%REGBP, %5)
  1372. "pxor %%mm7, %%mm7 \n\t"
  1373. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1374. #ifdef DITHER1XBPP
  1375. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1376. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1377. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1378. #endif
  1379. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1380. "pop %%"REG_BP" \n\t"
  1381. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1382. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1383. "a" (&c->redDither)
  1384. );
  1385. return;
  1386. case PIX_FMT_YUYV422:
  1387. __asm__ volatile(
  1388. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1389. "mov %4, %%"REG_b" \n\t"
  1390. "push %%"REG_BP" \n\t"
  1391. YSCALEYUV2PACKED1(%%REGBP, %5)
  1392. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1393. "pop %%"REG_BP" \n\t"
  1394. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1395. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1396. "a" (&c->redDither)
  1397. );
  1398. return;
  1399. }
  1400. } else {
  1401. switch(dstFormat) {
  1402. case PIX_FMT_RGB32:
  1403. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1404. __asm__ volatile(
  1405. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1406. "mov %4, %%"REG_b" \n\t"
  1407. "push %%"REG_BP" \n\t"
  1408. YSCALEYUV2RGB1b(%%REGBP, %5)
  1409. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1410. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1411. "pop %%"REG_BP" \n\t"
  1412. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1413. :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1414. "a" (&c->redDither)
  1415. );
  1416. } else {
  1417. __asm__ volatile(
  1418. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1419. "mov %4, %%"REG_b" \n\t"
  1420. "push %%"REG_BP" \n\t"
  1421. YSCALEYUV2RGB1b(%%REGBP, %5)
  1422. "pcmpeqd %%mm7, %%mm7 \n\t"
  1423. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1424. "pop %%"REG_BP" \n\t"
  1425. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1426. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1427. "a" (&c->redDither)
  1428. );
  1429. }
  1430. return;
  1431. case PIX_FMT_BGR24:
  1432. __asm__ volatile(
  1433. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1434. "mov %4, %%"REG_b" \n\t"
  1435. "push %%"REG_BP" \n\t"
  1436. YSCALEYUV2RGB1b(%%REGBP, %5)
  1437. "pxor %%mm7, %%mm7 \n\t"
  1438. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1439. "pop %%"REG_BP" \n\t"
  1440. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1441. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1442. "a" (&c->redDither)
  1443. );
  1444. return;
  1445. case PIX_FMT_RGB555:
  1446. __asm__ volatile(
  1447. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1448. "mov %4, %%"REG_b" \n\t"
  1449. "push %%"REG_BP" \n\t"
  1450. YSCALEYUV2RGB1b(%%REGBP, %5)
  1451. "pxor %%mm7, %%mm7 \n\t"
  1452. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1453. #ifdef DITHER1XBPP
  1454. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1455. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1456. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1457. #endif
  1458. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1459. "pop %%"REG_BP" \n\t"
  1460. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1461. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1462. "a" (&c->redDither)
  1463. );
  1464. return;
  1465. case PIX_FMT_RGB565:
  1466. __asm__ volatile(
  1467. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1468. "mov %4, %%"REG_b" \n\t"
  1469. "push %%"REG_BP" \n\t"
  1470. YSCALEYUV2RGB1b(%%REGBP, %5)
  1471. "pxor %%mm7, %%mm7 \n\t"
  1472. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1473. #ifdef DITHER1XBPP
  1474. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1475. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1476. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1477. #endif
  1478. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1479. "pop %%"REG_BP" \n\t"
  1480. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1481. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1482. "a" (&c->redDither)
  1483. );
  1484. return;
  1485. case PIX_FMT_YUYV422:
  1486. __asm__ volatile(
  1487. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1488. "mov %4, %%"REG_b" \n\t"
  1489. "push %%"REG_BP" \n\t"
  1490. YSCALEYUV2PACKED1b(%%REGBP, %5)
  1491. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1492. "pop %%"REG_BP" \n\t"
  1493. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1494. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1495. "a" (&c->redDither)
  1496. );
  1497. return;
  1498. }
  1499. }
  1500. }
  1501. #endif /* COMPILE_TEMPLATE_MMX */
  1502. if (uvalpha < 2048) {
  1503. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
  1504. } else {
  1505. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
  1506. }
  1507. }
1508. //FIXME yuy2* can read up to 7 samples too many
  1509. static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1510. {
  1511. #if COMPILE_TEMPLATE_MMX
  1512. __asm__ volatile(
  1513. "movq "MANGLE(bm01010101)", %%mm2 \n\t"
  1514. "mov %0, %%"REG_a" \n\t"
  1515. "1: \n\t"
  1516. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1517. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1518. "pand %%mm2, %%mm0 \n\t"
  1519. "pand %%mm2, %%mm1 \n\t"
  1520. "packuswb %%mm1, %%mm0 \n\t"
  1521. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1522. "add $8, %%"REG_a" \n\t"
  1523. " js 1b \n\t"
  1524. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1525. : "%"REG_a
  1526. );
  1527. #else
  1528. int i;
  1529. for (i=0; i<width; i++)
  1530. dst[i]= src[2*i];
  1531. #endif
  1532. }
  1533. static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1534. {
  1535. #if COMPILE_TEMPLATE_MMX
  1536. __asm__ volatile(
  1537. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1538. "mov %0, %%"REG_a" \n\t"
  1539. "1: \n\t"
  1540. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1541. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1542. "psrlw $8, %%mm0 \n\t"
  1543. "psrlw $8, %%mm1 \n\t"
  1544. "packuswb %%mm1, %%mm0 \n\t"
  1545. "movq %%mm0, %%mm1 \n\t"
  1546. "psrlw $8, %%mm0 \n\t"
  1547. "pand %%mm4, %%mm1 \n\t"
  1548. "packuswb %%mm0, %%mm0 \n\t"
  1549. "packuswb %%mm1, %%mm1 \n\t"
  1550. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1551. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1552. "add $4, %%"REG_a" \n\t"
  1553. " js 1b \n\t"
  1554. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1555. : "%"REG_a
  1556. );
  1557. #else
  1558. int i;
  1559. for (i=0; i<width; i++) {
  1560. dstU[i]= src1[4*i + 1];
  1561. dstV[i]= src1[4*i + 3];
  1562. }
  1563. #endif
  1564. assert(src1 == src2);
  1565. }
  1566. static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1567. {
  1568. #if COMPILE_TEMPLATE_MMX
  1569. __asm__ volatile(
  1570. "mov %0, %%"REG_a" \n\t"
  1571. "1: \n\t"
  1572. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1573. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1574. "movq (%2, %%"REG_a",2), %%mm2 \n\t"
  1575. "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
  1576. "psrlw $8, %%mm0 \n\t"
  1577. "psrlw $8, %%mm1 \n\t"
  1578. "psrlw $8, %%mm2 \n\t"
  1579. "psrlw $8, %%mm3 \n\t"
  1580. "packuswb %%mm1, %%mm0 \n\t"
  1581. "packuswb %%mm3, %%mm2 \n\t"
  1582. "movq %%mm0, (%3, %%"REG_a") \n\t"
  1583. "movq %%mm2, (%4, %%"REG_a") \n\t"
  1584. "add $8, %%"REG_a" \n\t"
  1585. " js 1b \n\t"
  1586. : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
  1587. : "%"REG_a
  1588. );
  1589. #else
  1590. int i;
  1591. for (i=0; i<width; i++) {
  1592. dstU[i]= src1[2*i + 1];
  1593. dstV[i]= src2[2*i + 1];
  1594. }
  1595. #endif
  1596. }
1597. /* This is almost identical to the previous, and exists only because
1598. * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
  1599. static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1600. {
  1601. #if COMPILE_TEMPLATE_MMX
  1602. __asm__ volatile(
  1603. "mov %0, %%"REG_a" \n\t"
  1604. "1: \n\t"
  1605. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1606. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1607. "psrlw $8, %%mm0 \n\t"
  1608. "psrlw $8, %%mm1 \n\t"
  1609. "packuswb %%mm1, %%mm0 \n\t"
  1610. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1611. "add $8, %%"REG_a" \n\t"
  1612. " js 1b \n\t"
  1613. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1614. : "%"REG_a
  1615. );
  1616. #else
  1617. int i;
  1618. for (i=0; i<width; i++)
  1619. dst[i]= src[2*i+1];
  1620. #endif
  1621. }
  1622. static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1623. {
  1624. #if COMPILE_TEMPLATE_MMX
  1625. __asm__ volatile(
  1626. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1627. "mov %0, %%"REG_a" \n\t"
  1628. "1: \n\t"
  1629. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1630. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1631. "pand %%mm4, %%mm0 \n\t"
  1632. "pand %%mm4, %%mm1 \n\t"
  1633. "packuswb %%mm1, %%mm0 \n\t"
  1634. "movq %%mm0, %%mm1 \n\t"
  1635. "psrlw $8, %%mm0 \n\t"
  1636. "pand %%mm4, %%mm1 \n\t"
  1637. "packuswb %%mm0, %%mm0 \n\t"
  1638. "packuswb %%mm1, %%mm1 \n\t"
  1639. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1640. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1641. "add $4, %%"REG_a" \n\t"
  1642. " js 1b \n\t"
  1643. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1644. : "%"REG_a
  1645. );
  1646. #else
  1647. int i;
  1648. for (i=0; i<width; i++) {
  1649. dstU[i]= src1[4*i + 0];
  1650. dstV[i]= src1[4*i + 2];
  1651. }
  1652. #endif
  1653. assert(src1 == src2);
  1654. }
  1655. static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1656. {
  1657. #if COMPILE_TEMPLATE_MMX
  1658. __asm__ volatile(
  1659. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1660. "mov %0, %%"REG_a" \n\t"
  1661. "1: \n\t"
  1662. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1663. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1664. "movq (%2, %%"REG_a",2), %%mm2 \n\t"
  1665. "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
  1666. "pand %%mm4, %%mm0 \n\t"
  1667. "pand %%mm4, %%mm1 \n\t"
  1668. "pand %%mm4, %%mm2 \n\t"
  1669. "pand %%mm4, %%mm3 \n\t"
  1670. "packuswb %%mm1, %%mm0 \n\t"
  1671. "packuswb %%mm3, %%mm2 \n\t"
  1672. "movq %%mm0, (%3, %%"REG_a") \n\t"
  1673. "movq %%mm2, (%4, %%"REG_a") \n\t"
  1674. "add $8, %%"REG_a" \n\t"
  1675. " js 1b \n\t"
  1676. : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
  1677. : "%"REG_a
  1678. );
  1679. #else
  1680. int i;
  1681. for (i=0; i<width; i++) {
  1682. dstU[i]= src1[2*i];
  1683. dstV[i]= src2[2*i];
  1684. }
  1685. #endif
  1686. }
  1687. static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
  1688. const uint8_t *src, long width)
  1689. {
  1690. #if COMPILE_TEMPLATE_MMX
  1691. __asm__ volatile(
  1692. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1693. "mov %0, %%"REG_a" \n\t"
  1694. "1: \n\t"
  1695. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1696. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1697. "movq %%mm0, %%mm2 \n\t"
  1698. "movq %%mm1, %%mm3 \n\t"
  1699. "pand %%mm4, %%mm0 \n\t"
  1700. "pand %%mm4, %%mm1 \n\t"
  1701. "psrlw $8, %%mm2 \n\t"
  1702. "psrlw $8, %%mm3 \n\t"
  1703. "packuswb %%mm1, %%mm0 \n\t"
  1704. "packuswb %%mm3, %%mm2 \n\t"
  1705. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1706. "movq %%mm2, (%3, %%"REG_a") \n\t"
  1707. "add $8, %%"REG_a" \n\t"
  1708. " js 1b \n\t"
  1709. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
  1710. : "%"REG_a
  1711. );
  1712. #else
  1713. int i;
  1714. for (i = 0; i < width; i++) {
  1715. dst1[i] = src[2*i+0];
  1716. dst2[i] = src[2*i+1];
  1717. }
  1718. #endif
  1719. }
  1720. static inline void nv12ToUV(uint8_t *dstU, uint8_t *dstV,
  1721. const uint8_t *src1, const uint8_t *src2,
  1722. long width, uint32_t *unused)
  1723. {
  1724. RENAME(nvXXtoUV)(dstU, dstV, src1, width);
  1725. }
  1726. static inline void nv21ToUV(uint8_t *dstU, uint8_t *dstV,
  1727. const uint8_t *src1, const uint8_t *src2,
  1728. long width, uint32_t *unused)
  1729. {
  1730. RENAME(nvXXtoUV)(dstV, dstU, src1, width);
  1731. }
  1732. #if COMPILE_TEMPLATE_MMX
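/*
 * bgr24ToY_mmx() converts four packed 24-bit pixels per iteration: two
 * overlapping 4-byte loads per pixel pair are multiplied with the packed
 * RGB->Y coefficients via pmaddwd, the partial products are summed, the
 * rounding offset is added and the result is shifted down by RGB2YUV_SHIFT.
 * The BGR24 and RGB24 cases differ only in which coefficient table is loaded
 * into mm5/mm6.
 */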
  1733. static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
  1734. {
  1735. if(srcFormat == PIX_FMT_BGR24) {
  1736. __asm__ volatile(
  1737. "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
  1738. "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
  1739. :
  1740. );
  1741. } else {
  1742. __asm__ volatile(
  1743. "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
  1744. "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
  1745. :
  1746. );
  1747. }
  1748. __asm__ volatile(
  1749. "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
  1750. "mov %2, %%"REG_a" \n\t"
  1751. "pxor %%mm7, %%mm7 \n\t"
  1752. "1: \n\t"
  1753. PREFETCH" 64(%0) \n\t"
  1754. "movd (%0), %%mm0 \n\t"
  1755. "movd 2(%0), %%mm1 \n\t"
  1756. "movd 6(%0), %%mm2 \n\t"
  1757. "movd 8(%0), %%mm3 \n\t"
  1758. "add $12, %0 \n\t"
  1759. "punpcklbw %%mm7, %%mm0 \n\t"
  1760. "punpcklbw %%mm7, %%mm1 \n\t"
  1761. "punpcklbw %%mm7, %%mm2 \n\t"
  1762. "punpcklbw %%mm7, %%mm3 \n\t"
  1763. "pmaddwd %%mm5, %%mm0 \n\t"
  1764. "pmaddwd %%mm6, %%mm1 \n\t"
  1765. "pmaddwd %%mm5, %%mm2 \n\t"
  1766. "pmaddwd %%mm6, %%mm3 \n\t"
  1767. "paddd %%mm1, %%mm0 \n\t"
  1768. "paddd %%mm3, %%mm2 \n\t"
  1769. "paddd %%mm4, %%mm0 \n\t"
  1770. "paddd %%mm4, %%mm2 \n\t"
  1771. "psrad $15, %%mm0 \n\t"
  1772. "psrad $15, %%mm2 \n\t"
  1773. "packssdw %%mm2, %%mm0 \n\t"
  1774. "packuswb %%mm0, %%mm0 \n\t"
  1775. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1776. "add $4, %%"REG_a" \n\t"
  1777. " js 1b \n\t"
  1778. : "+r" (src)
  1779. : "r" (dst+width), "g" ((x86_reg)-width)
  1780. : "%"REG_a
  1781. );
  1782. }
  1783. static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
  1784. {
  1785. __asm__ volatile(
  1786. "movq 24+%4, %%mm6 \n\t"
  1787. "mov %3, %%"REG_a" \n\t"
  1788. "pxor %%mm7, %%mm7 \n\t"
  1789. "1: \n\t"
  1790. PREFETCH" 64(%0) \n\t"
  1791. "movd (%0), %%mm0 \n\t"
  1792. "movd 2(%0), %%mm1 \n\t"
  1793. "punpcklbw %%mm7, %%mm0 \n\t"
  1794. "punpcklbw %%mm7, %%mm1 \n\t"
  1795. "movq %%mm0, %%mm2 \n\t"
  1796. "movq %%mm1, %%mm3 \n\t"
  1797. "pmaddwd %4, %%mm0 \n\t"
  1798. "pmaddwd 8+%4, %%mm1 \n\t"
  1799. "pmaddwd 16+%4, %%mm2 \n\t"
  1800. "pmaddwd %%mm6, %%mm3 \n\t"
  1801. "paddd %%mm1, %%mm0 \n\t"
  1802. "paddd %%mm3, %%mm2 \n\t"
  1803. "movd 6(%0), %%mm1 \n\t"
  1804. "movd 8(%0), %%mm3 \n\t"
  1805. "add $12, %0 \n\t"
  1806. "punpcklbw %%mm7, %%mm1 \n\t"
  1807. "punpcklbw %%mm7, %%mm3 \n\t"
  1808. "movq %%mm1, %%mm4 \n\t"
  1809. "movq %%mm3, %%mm5 \n\t"
  1810. "pmaddwd %4, %%mm1 \n\t"
  1811. "pmaddwd 8+%4, %%mm3 \n\t"
  1812. "pmaddwd 16+%4, %%mm4 \n\t"
  1813. "pmaddwd %%mm6, %%mm5 \n\t"
  1814. "paddd %%mm3, %%mm1 \n\t"
  1815. "paddd %%mm5, %%mm4 \n\t"
  1816. "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
  1817. "paddd %%mm3, %%mm0 \n\t"
  1818. "paddd %%mm3, %%mm2 \n\t"
  1819. "paddd %%mm3, %%mm1 \n\t"
  1820. "paddd %%mm3, %%mm4 \n\t"
  1821. "psrad $15, %%mm0 \n\t"
  1822. "psrad $15, %%mm2 \n\t"
  1823. "psrad $15, %%mm1 \n\t"
  1824. "psrad $15, %%mm4 \n\t"
  1825. "packssdw %%mm1, %%mm0 \n\t"
  1826. "packssdw %%mm4, %%mm2 \n\t"
  1827. "packuswb %%mm0, %%mm0 \n\t"
  1828. "packuswb %%mm2, %%mm2 \n\t"
  1829. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1830. "movd %%mm2, (%2, %%"REG_a") \n\t"
  1831. "add $4, %%"REG_a" \n\t"
  1832. " js 1b \n\t"
  1833. : "+r" (src)
  1834. : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
  1835. : "%"REG_a
  1836. );
  1837. }
  1838. #endif
  1839. static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1840. {
  1841. #if COMPILE_TEMPLATE_MMX
  1842. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
  1843. #else
  1844. int i;
  1845. for (i=0; i<width; i++) {
  1846. int b= src[i*3+0];
  1847. int g= src[i*3+1];
  1848. int r= src[i*3+2];
  1849. dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
  1850. }
  1851. #endif /* COMPILE_TEMPLATE_MMX */
  1852. }
  1853. static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1854. {
  1855. #if COMPILE_TEMPLATE_MMX
  1856. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
  1857. #else
  1858. int i;
  1859. for (i=0; i<width; i++) {
  1860. int b= src1[3*i + 0];
  1861. int g= src1[3*i + 1];
  1862. int r= src1[3*i + 2];
  1863. dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1864. dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1865. }
  1866. #endif /* COMPILE_TEMPLATE_MMX */
  1867. assert(src1 == src2);
  1868. }
  1869. static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1870. {
  1871. int i;
  1872. for (i=0; i<width; i++) {
  1873. int b= src1[6*i + 0] + src1[6*i + 3];
  1874. int g= src1[6*i + 1] + src1[6*i + 4];
  1875. int r= src1[6*i + 2] + src1[6*i + 5];
  1876. dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1877. dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1878. }
  1879. assert(src1 == src2);
  1880. }
  1881. static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1882. {
  1883. #if COMPILE_TEMPLATE_MMX
  1884. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
  1885. #else
  1886. int i;
  1887. for (i=0; i<width; i++) {
  1888. int r= src[i*3+0];
  1889. int g= src[i*3+1];
  1890. int b= src[i*3+2];
  1891. dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
  1892. }
  1893. #endif
  1894. }
  1895. static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1896. {
  1897. #if COMPILE_TEMPLATE_MMX
  1898. assert(src1==src2);
  1899. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
  1900. #else
  1901. int i;
  1902. assert(src1==src2);
  1903. for (i=0; i<width; i++) {
  1904. int r= src1[3*i + 0];
  1905. int g= src1[3*i + 1];
  1906. int b= src1[3*i + 2];
  1907. dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1908. dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
  1909. }
  1910. #endif
  1911. }
  1912. static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1913. {
  1914. int i;
  1915. assert(src1==src2);
  1916. for (i=0; i<width; i++) {
  1917. int r= src1[6*i + 0] + src1[6*i + 3];
  1918. int g= src1[6*i + 1] + src1[6*i + 4];
  1919. int b= src1[6*i + 2] + src1[6*i + 5];
  1920. dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1921. dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1922. }
  1923. }
  1924. // bilinear / bicubic scaling
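/*
 * hScale(): dst[i] is the dot product of filterSize source bytes starting at
 * src[filterPos[i]] with the 16-bit coefficients filter[i*filterSize..],
 * shifted down by 7 and clamped (see the C reference loop at the bottom).
 * The MMX code has dedicated loops for filterSize 4 and 8 that produce two
 * output samples per iteration with a negative counter running up towards
 * zero, plus a generic loop for larger filter sizes.
 */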
  1925. static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
  1926. const int16_t *filter, const int16_t *filterPos, long filterSize)
  1927. {
  1928. #if COMPILE_TEMPLATE_MMX
  1929. assert(filterSize % 4 == 0 && filterSize>0);
  1930. if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
  1931. x86_reg counter= -2*dstW;
  1932. filter-= counter*2;
  1933. filterPos-= counter/2;
  1934. dst-= counter/2;
  1935. __asm__ volatile(
  1936. #if defined(PIC)
  1937. "push %%"REG_b" \n\t"
  1938. #endif
  1939. "pxor %%mm7, %%mm7 \n\t"
  1940. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  1941. "mov %%"REG_a", %%"REG_BP" \n\t"
  1942. ASMALIGN(4)
  1943. "1: \n\t"
  1944. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  1945. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  1946. "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
  1947. "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
  1948. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  1949. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  1950. "punpcklbw %%mm7, %%mm0 \n\t"
  1951. "punpcklbw %%mm7, %%mm2 \n\t"
  1952. "pmaddwd %%mm1, %%mm0 \n\t"
  1953. "pmaddwd %%mm2, %%mm3 \n\t"
  1954. "movq %%mm0, %%mm4 \n\t"
  1955. "punpckldq %%mm3, %%mm0 \n\t"
  1956. "punpckhdq %%mm3, %%mm4 \n\t"
  1957. "paddd %%mm4, %%mm0 \n\t"
  1958. "psrad $7, %%mm0 \n\t"
  1959. "packssdw %%mm0, %%mm0 \n\t"
  1960. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  1961. "add $4, %%"REG_BP" \n\t"
  1962. " jnc 1b \n\t"
  1963. "pop %%"REG_BP" \n\t"
  1964. #if defined(PIC)
  1965. "pop %%"REG_b" \n\t"
  1966. #endif
  1967. : "+a" (counter)
  1968. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  1969. #if !defined(PIC)
  1970. : "%"REG_b
  1971. #endif
  1972. );
  1973. } else if (filterSize==8) {
  1974. x86_reg counter= -2*dstW;
  1975. filter-= counter*4;
  1976. filterPos-= counter/2;
  1977. dst-= counter/2;
  1978. __asm__ volatile(
  1979. #if defined(PIC)
  1980. "push %%"REG_b" \n\t"
  1981. #endif
  1982. "pxor %%mm7, %%mm7 \n\t"
  1983. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  1984. "mov %%"REG_a", %%"REG_BP" \n\t"
  1985. ASMALIGN(4)
  1986. "1: \n\t"
  1987. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  1988. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  1989. "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
  1990. "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
  1991. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  1992. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  1993. "punpcklbw %%mm7, %%mm0 \n\t"
  1994. "punpcklbw %%mm7, %%mm2 \n\t"
  1995. "pmaddwd %%mm1, %%mm0 \n\t"
  1996. "pmaddwd %%mm2, %%mm3 \n\t"
  1997. "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
  1998. "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
  1999. "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
  2000. "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
  2001. "punpcklbw %%mm7, %%mm4 \n\t"
  2002. "punpcklbw %%mm7, %%mm2 \n\t"
  2003. "pmaddwd %%mm1, %%mm4 \n\t"
  2004. "pmaddwd %%mm2, %%mm5 \n\t"
  2005. "paddd %%mm4, %%mm0 \n\t"
  2006. "paddd %%mm5, %%mm3 \n\t"
  2007. "movq %%mm0, %%mm4 \n\t"
  2008. "punpckldq %%mm3, %%mm0 \n\t"
  2009. "punpckhdq %%mm3, %%mm4 \n\t"
  2010. "paddd %%mm4, %%mm0 \n\t"
  2011. "psrad $7, %%mm0 \n\t"
  2012. "packssdw %%mm0, %%mm0 \n\t"
  2013. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  2014. "add $4, %%"REG_BP" \n\t"
  2015. " jnc 1b \n\t"
  2016. "pop %%"REG_BP" \n\t"
  2017. #if defined(PIC)
  2018. "pop %%"REG_b" \n\t"
  2019. #endif
  2020. : "+a" (counter)
  2021. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  2022. #if !defined(PIC)
  2023. : "%"REG_b
  2024. #endif
  2025. );
  2026. } else {
2027. const uint8_t *offset = src+filterSize;
  2028. x86_reg counter= -2*dstW;
  2029. //filter-= counter*filterSize/2;
  2030. filterPos-= counter/2;
  2031. dst-= counter/2;
  2032. __asm__ volatile(
  2033. "pxor %%mm7, %%mm7 \n\t"
  2034. ASMALIGN(4)
  2035. "1: \n\t"
  2036. "mov %2, %%"REG_c" \n\t"
  2037. "movzwl (%%"REG_c", %0), %%eax \n\t"
  2038. "movzwl 2(%%"REG_c", %0), %%edx \n\t"
  2039. "mov %5, %%"REG_c" \n\t"
  2040. "pxor %%mm4, %%mm4 \n\t"
  2041. "pxor %%mm5, %%mm5 \n\t"
  2042. "2: \n\t"
  2043. "movq (%1), %%mm1 \n\t"
  2044. "movq (%1, %6), %%mm3 \n\t"
  2045. "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
  2046. "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
  2047. "punpcklbw %%mm7, %%mm0 \n\t"
  2048. "punpcklbw %%mm7, %%mm2 \n\t"
  2049. "pmaddwd %%mm1, %%mm0 \n\t"
  2050. "pmaddwd %%mm2, %%mm3 \n\t"
  2051. "paddd %%mm3, %%mm5 \n\t"
  2052. "paddd %%mm0, %%mm4 \n\t"
  2053. "add $8, %1 \n\t"
  2054. "add $4, %%"REG_c" \n\t"
  2055. "cmp %4, %%"REG_c" \n\t"
  2056. " jb 2b \n\t"
  2057. "add %6, %1 \n\t"
  2058. "movq %%mm4, %%mm0 \n\t"
  2059. "punpckldq %%mm5, %%mm4 \n\t"
  2060. "punpckhdq %%mm5, %%mm0 \n\t"
  2061. "paddd %%mm0, %%mm4 \n\t"
  2062. "psrad $7, %%mm4 \n\t"
  2063. "packssdw %%mm4, %%mm4 \n\t"
  2064. "mov %3, %%"REG_a" \n\t"
  2065. "movd %%mm4, (%%"REG_a", %0) \n\t"
  2066. "add $4, %0 \n\t"
  2067. " jnc 1b \n\t"
  2068. : "+r" (counter), "+r" (filter)
  2069. : "m" (filterPos), "m" (dst), "m"(offset),
  2070. "m" (src), "r" ((x86_reg)filterSize*2)
  2071. : "%"REG_a, "%"REG_c, "%"REG_d
  2072. );
  2073. }
  2074. #else
  2075. #if COMPILE_TEMPLATE_ALTIVEC
  2076. hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
  2077. #else
  2078. int i;
  2079. for (i=0; i<dstW; i++) {
  2080. int j;
  2081. int srcPos= filterPos[i];
  2082. int val=0;
  2083. //printf("filterPos: %d\n", filterPos[i]);
  2084. for (j=0; j<filterSize; j++) {
  2085. //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
  2086. val += ((int)src[srcPos + j])*filter[filterSize*i + j];
  2087. }
  2088. //filter += hFilterSize;
  2089. dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
  2090. //dst[i] = val>>7;
  2091. }
2092. #endif /* COMPILE_TEMPLATE_ALTIVEC */
2093. #endif /* COMPILE_TEMPLATE_MMX */
  2094. }
  2095. #define FAST_BILINEAR_X86 \
  2096. "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
  2097. "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
  2098. "shll $16, %%edi \n\t" \
  2099. "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
  2100. "mov %1, %%"REG_D"\n\t" \
  2101. "shrl $9, %%esi \n\t" \
  2102. static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
  2103. int dstWidth, const uint8_t *src, int srcW,
  2104. int xInc)
  2105. {
  2106. int i;
  2107. unsigned int xpos=0;
  2108. for (i=0;i<dstWidth;i++) {
  2109. register unsigned int xx=xpos>>16;
  2110. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  2111. dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
  2112. xpos+=xInc;
  2113. }
  2114. }
  2115. // *** horizontal scale Y line to temp buffer
  2116. static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
  2117. int flags, const int16_t *hLumFilter,
  2118. const int16_t *hLumFilterPos, int hLumFilterSize,
  2119. enum PixelFormat srcFormat, uint8_t *formatConvBuffer,
  2120. uint32_t *pal, int isAlpha)
  2121. {
  2122. int32_t av_unused *mmx2FilterPos = c->lumMmx2FilterPos;
  2123. int16_t av_unused *mmx2Filter = c->lumMmx2Filter;
  2124. int av_unused canMMX2BeUsed = c->canMMX2BeUsed;
  2125. void av_unused *mmx2FilterCode= c->lumMmx2FilterCode;
  2126. void (*internal_func)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->hascale_internal : c->hyscale_internal;
  2127. if (isAlpha) {
  2128. if (srcFormat == PIX_FMT_RGB32 || srcFormat == PIX_FMT_BGR32 )
  2129. src += 3;
  2130. } else {
  2131. if (srcFormat == PIX_FMT_RGB32_1 || srcFormat == PIX_FMT_BGR32_1)
  2132. src += ALT32_CORR;
  2133. }
  2134. if (srcFormat == PIX_FMT_RGB48LE)
  2135. src++;
  2136. if (internal_func) {
  2137. internal_func(formatConvBuffer, src, srcW, pal);
  2138. src= formatConvBuffer;
  2139. }
  2140. #if COMPILE_TEMPLATE_MMX
2141. // Use the MMX hScale() if the MMX2 fast path cannot be used (it is still faster than the plain x86 ASM fallback).
  2142. if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
  2143. #else
  2144. if (!(flags&SWS_FAST_BILINEAR))
  2145. #endif
  2146. {
  2147. c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
  2148. } else { // fast bilinear upscale / crap downscale
  2149. #if ARCH_X86 && CONFIG_GPL
  2150. #if COMPILE_TEMPLATE_MMX2
  2151. int i;
  2152. #if defined(PIC)
  2153. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  2154. #endif
  2155. if (canMMX2BeUsed) {
  2156. __asm__ volatile(
  2157. #if defined(PIC)
  2158. "mov %%"REG_b", %5 \n\t"
  2159. #endif
  2160. "pxor %%mm7, %%mm7 \n\t"
  2161. "mov %0, %%"REG_c" \n\t"
  2162. "mov %1, %%"REG_D" \n\t"
  2163. "mov %2, %%"REG_d" \n\t"
  2164. "mov %3, %%"REG_b" \n\t"
  2165. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2166. PREFETCH" (%%"REG_c") \n\t"
  2167. PREFETCH" 32(%%"REG_c") \n\t"
  2168. PREFETCH" 64(%%"REG_c") \n\t"
  2169. #if ARCH_X86_64
  2170. #define CALL_MMX2_FILTER_CODE \
  2171. "movl (%%"REG_b"), %%esi \n\t"\
  2172. "call *%4 \n\t"\
  2173. "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
  2174. "add %%"REG_S", %%"REG_c" \n\t"\
  2175. "add %%"REG_a", %%"REG_D" \n\t"\
  2176. "xor %%"REG_a", %%"REG_a" \n\t"\
  2177. #else
  2178. #define CALL_MMX2_FILTER_CODE \
  2179. "movl (%%"REG_b"), %%esi \n\t"\
  2180. "call *%4 \n\t"\
  2181. "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
  2182. "add %%"REG_a", %%"REG_D" \n\t"\
  2183. "xor %%"REG_a", %%"REG_a" \n\t"\
  2184. #endif /* ARCH_X86_64 */
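/* Each CALL_MMX2_FILTER_CODE invocation calls into the horizontal-scaler code
   that was generated at init time (the mmx2FilterCode buffer) for the next
   chunk of the line, then advances the source and destination pointers for
   the following chunk. */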
  2185. CALL_MMX2_FILTER_CODE
  2186. CALL_MMX2_FILTER_CODE
  2187. CALL_MMX2_FILTER_CODE
  2188. CALL_MMX2_FILTER_CODE
  2189. CALL_MMX2_FILTER_CODE
  2190. CALL_MMX2_FILTER_CODE
  2191. CALL_MMX2_FILTER_CODE
  2192. CALL_MMX2_FILTER_CODE
  2193. #if defined(PIC)
  2194. "mov %5, %%"REG_b" \n\t"
  2195. #endif
  2196. :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
  2197. "m" (mmx2FilterCode)
  2198. #if defined(PIC)
  2199. ,"m" (ebxsave)
  2200. #endif
  2201. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2202. #if !defined(PIC)
  2203. ,"%"REG_b
  2204. #endif
  2205. );
  2206. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
  2207. } else {
  2208. #endif /* COMPILE_TEMPLATE_MMX2 */
  2209. x86_reg xInc_shr16 = xInc >> 16;
  2210. uint16_t xInc_mask = xInc & 0xffff;
2211. // no MMX, just plain x86 asm ...
  2212. __asm__ volatile(
  2213. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2214. "xor %%"REG_d", %%"REG_d" \n\t" // xx
  2215. "xorl %%ecx, %%ecx \n\t" // xalpha
  2216. ASMALIGN(4)
  2217. "1: \n\t"
  2218. "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
  2219. "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2220. FAST_BILINEAR_X86
  2221. "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
  2222. "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
  2223. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
  2224. "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
  2225. "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2226. FAST_BILINEAR_X86
  2227. "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
  2228. "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
  2229. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
  2230. "add $2, %%"REG_a" \n\t"
  2231. "cmp %2, %%"REG_a" \n\t"
  2232. " jb 1b \n\t"
  2233. :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
  2234. : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
  2235. );
  2236. #if COMPILE_TEMPLATE_MMX2
  2237. } //if MMX2 can't be used
  2238. #endif
  2239. #else
  2240. c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
  2241. #endif /* ARCH_X86 */
  2242. }
  2243. if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))) {
  2244. int i;
2245. //FIXME all pal and rgb srcFormats could do this conversion as well
  2246. //FIXME all scalers more complex than bilinear could do half of this transform
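/* dst[] holds luma in 1.7 fixed point (the value is <<7).  14071/16384 is
   roughly 219/255 and 33561947>>14 is roughly 16<<7, so the first branch maps
   full (JPEG) range to limited (MPEG) range; 19077/16384 is roughly 255/219
   for the opposite direction, and the FFMIN() clamp keeps the result below
   2^15. */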
  2247. if(c->srcRange) {
  2248. for (i=0; i<dstWidth; i++)
  2249. dst[i]= (dst[i]*14071 + 33561947)>>14;
  2250. } else {
  2251. for (i=0; i<dstWidth; i++)
  2252. dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
  2253. }
  2254. }
  2255. }
  2256. static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
  2257. int dstWidth, const uint8_t *src1,
  2258. const uint8_t *src2, int srcW, int xInc)
  2259. {
  2260. int i;
  2261. unsigned int xpos=0;
  2262. for (i=0;i<dstWidth;i++) {
  2263. register unsigned int xx=xpos>>16;
  2264. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  2265. dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
  2266. dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
  2267. /* slower
  2268. dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
  2269. dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
  2270. */
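/* xalpha is only 7 bits, so xalpha^127 == 127-xalpha: the two weights sum to
   127 instead of 128, a cheap approximation of the exact form kept in the
   comment above. */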
  2271. xpos+=xInc;
  2272. }
  2273. }
  2274. inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
  2275. int srcW, int xInc, int flags, const int16_t *hChrFilter,
  2276. const int16_t *hChrFilterPos, int hChrFilterSize,
  2277. enum PixelFormat srcFormat, uint8_t *formatConvBuffer,
  2278. uint32_t *pal)
  2279. {
  2280. int32_t av_unused *mmx2FilterPos = c->chrMmx2FilterPos;
  2281. int16_t av_unused *mmx2Filter = c->chrMmx2Filter;
  2282. int av_unused canMMX2BeUsed = c->canMMX2BeUsed;
  2283. void av_unused *mmx2FilterCode= c->chrMmx2FilterCode;
  2284. if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
  2285. return;
  2286. if (srcFormat==PIX_FMT_RGB32_1 || srcFormat==PIX_FMT_BGR32_1) {
  2287. src1 += ALT32_CORR;
  2288. src2 += ALT32_CORR;
  2289. }
  2290. if (srcFormat==PIX_FMT_RGB48LE) {
  2291. src1++;
  2292. src2++;
  2293. }
  2294. if (c->hcscale_internal) {
  2295. c->hcscale_internal(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2296. src1= formatConvBuffer;
  2297. src2= formatConvBuffer+VOFW;
  2298. }
  2299. #if COMPILE_TEMPLATE_MMX
2300. // Use the MMX hScale() if the MMX2 fast path cannot be used (it is still faster than the plain x86 ASM fallback).
  2301. if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
  2302. #else
  2303. if (!(flags&SWS_FAST_BILINEAR))
  2304. #endif
  2305. {
  2306. c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  2307. c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  2308. } else { // fast bilinear upscale / crap downscale
  2309. #if ARCH_X86 && CONFIG_GPL
  2310. #if COMPILE_TEMPLATE_MMX2
  2311. int i;
  2312. #if defined(PIC)
  2313. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  2314. #endif
  2315. if (canMMX2BeUsed) {
  2316. __asm__ volatile(
  2317. #if defined(PIC)
  2318. "mov %%"REG_b", %6 \n\t"
  2319. #endif
  2320. "pxor %%mm7, %%mm7 \n\t"
  2321. "mov %0, %%"REG_c" \n\t"
  2322. "mov %1, %%"REG_D" \n\t"
  2323. "mov %2, %%"REG_d" \n\t"
  2324. "mov %3, %%"REG_b" \n\t"
  2325. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2326. PREFETCH" (%%"REG_c") \n\t"
  2327. PREFETCH" 32(%%"REG_c") \n\t"
  2328. PREFETCH" 64(%%"REG_c") \n\t"
  2329. CALL_MMX2_FILTER_CODE
  2330. CALL_MMX2_FILTER_CODE
  2331. CALL_MMX2_FILTER_CODE
  2332. CALL_MMX2_FILTER_CODE
  2333. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2334. "mov %5, %%"REG_c" \n\t" // src
  2335. "mov %1, %%"REG_D" \n\t" // buf1
  2336. "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
  2337. PREFETCH" (%%"REG_c") \n\t"
  2338. PREFETCH" 32(%%"REG_c") \n\t"
  2339. PREFETCH" 64(%%"REG_c") \n\t"
  2340. CALL_MMX2_FILTER_CODE
  2341. CALL_MMX2_FILTER_CODE
  2342. CALL_MMX2_FILTER_CODE
  2343. CALL_MMX2_FILTER_CODE
  2344. #if defined(PIC)
  2345. "mov %6, %%"REG_b" \n\t"
  2346. #endif
  2347. :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
  2348. "m" (mmx2FilterCode), "m" (src2)
  2349. #if defined(PIC)
  2350. ,"m" (ebxsave)
  2351. #endif
  2352. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2353. #if !defined(PIC)
  2354. ,"%"REG_b
  2355. #endif
  2356. );
  2357. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
  2358. //printf("%d %d %d\n", dstWidth, i, srcW);
  2359. dst[i] = src1[srcW-1]*128;
  2360. dst[i+VOFW] = src2[srcW-1]*128;
  2361. }
  2362. } else {
  2363. #endif /* COMPILE_TEMPLATE_MMX2 */
  2364. x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
  2365. uint16_t xInc_mask = xInc & 0xffff;
  2366. __asm__ volatile(
  2367. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2368. "xor %%"REG_d", %%"REG_d" \n\t" // xx
  2369. "xorl %%ecx, %%ecx \n\t" // xalpha
  2370. ASMALIGN(4)
  2371. "1: \n\t"
  2372. "mov %0, %%"REG_S" \n\t"
  2373. "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
  2374. "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
  2375. FAST_BILINEAR_X86
  2376. "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
  2377. "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
  2378. "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2379. FAST_BILINEAR_X86
  2380. "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
  2381. "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
  2382. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
  2383. "add $1, %%"REG_a" \n\t"
  2384. "cmp %2, %%"REG_a" \n\t"
  2385. " jb 1b \n\t"
  2386. /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
  2387. which is needed to support GCC 4.0. */
  2388. #if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
  2389. :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
  2390. #else
  2391. :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
  2392. #endif
  2393. "r" (src2)
  2394. : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
  2395. );
  2396. #if COMPILE_TEMPLATE_MMX2
  2397. } //if MMX2 can't be used
  2398. #endif
  2399. #else
  2400. c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
  2401. #endif /* ARCH_X86 */
  2402. }
  2403. if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))) {
  2404. int i;
2405. //FIXME all pal and rgb srcFormats could do this conversion as well
  2406. //FIXME all scalers more complex than bilinear could do half of this transform
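/* Same range conversion as in hyscale(), but for chroma: 1799/2048 is roughly
   224/255 and 4663/4096 is roughly 255/224, with the offsets keeping the
   neutral value near 128<<7 and the FFMIN() clamp keeping the result below
   2^15. */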
  2407. if(c->srcRange) {
  2408. for (i=0; i<dstWidth; i++) {
  2409. dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
  2410. dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
  2411. }
  2412. } else {
  2413. for (i=0; i<dstWidth; i++) {
  2414. dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
  2415. dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
  2416. }
  2417. }
  2418. }
  2419. }
  2420. #define DEBUG_SWSCALE_BUFFERS 0
  2421. #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
  2422. static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  2423. int srcSliceH, uint8_t* dst[], int dstStride[])
  2424. {
2425. /* load a few things into local vars to make the code more readable and faster */
  2426. const int srcW= c->srcW;
  2427. const int dstW= c->dstW;
  2428. const int dstH= c->dstH;
  2429. const int chrDstW= c->chrDstW;
  2430. const int chrSrcW= c->chrSrcW;
  2431. const int lumXInc= c->lumXInc;
  2432. const int chrXInc= c->chrXInc;
  2433. const enum PixelFormat dstFormat= c->dstFormat;
  2434. const enum PixelFormat srcFormat= c->srcFormat;
  2435. const int flags= c->flags;
  2436. int16_t *vLumFilterPos= c->vLumFilterPos;
  2437. int16_t *vChrFilterPos= c->vChrFilterPos;
  2438. int16_t *hLumFilterPos= c->hLumFilterPos;
  2439. int16_t *hChrFilterPos= c->hChrFilterPos;
  2440. int16_t *vLumFilter= c->vLumFilter;
  2441. int16_t *vChrFilter= c->vChrFilter;
  2442. int16_t *hLumFilter= c->hLumFilter;
  2443. int16_t *hChrFilter= c->hChrFilter;
  2444. int32_t *lumMmxFilter= c->lumMmxFilter;
  2445. int32_t *chrMmxFilter= c->chrMmxFilter;
  2446. int32_t *alpMmxFilter= c->alpMmxFilter;
  2447. const int vLumFilterSize= c->vLumFilterSize;
  2448. const int vChrFilterSize= c->vChrFilterSize;
  2449. const int hLumFilterSize= c->hLumFilterSize;
  2450. const int hChrFilterSize= c->hChrFilterSize;
  2451. int16_t **lumPixBuf= c->lumPixBuf;
  2452. int16_t **chrPixBuf= c->chrPixBuf;
  2453. int16_t **alpPixBuf= c->alpPixBuf;
  2454. const int vLumBufSize= c->vLumBufSize;
  2455. const int vChrBufSize= c->vChrBufSize;
  2456. uint8_t *formatConvBuffer= c->formatConvBuffer;
  2457. const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
  2458. const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
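/* -((-x) >> s) divides by 2^s rounding up instead of down, so a partially
   covered chroma line at the bottom of the slice is still counted. */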
  2459. int lastDstY;
  2460. uint32_t *pal=c->pal_yuv;
  2461. /* vars which will change and which we need to store back in the context */
  2462. int dstY= c->dstY;
  2463. int lumBufIndex= c->lumBufIndex;
  2464. int chrBufIndex= c->chrBufIndex;
  2465. int lastInLumBuf= c->lastInLumBuf;
  2466. int lastInChrBuf= c->lastInChrBuf;
  2467. if (isPacked(c->srcFormat)) {
  2468. src[0]=
  2469. src[1]=
  2470. src[2]=
  2471. src[3]= src[0];
  2472. srcStride[0]=
  2473. srcStride[1]=
  2474. srcStride[2]=
  2475. srcStride[3]= srcStride[0];
  2476. }
  2477. srcStride[1]<<= c->vChrDrop;
  2478. srcStride[2]<<= c->vChrDrop;
  2479. DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
  2480. src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
  2481. dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
  2482. DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
  2483. srcSliceY, srcSliceH, dstY, dstH);
  2484. DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
  2485. vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
  2486. if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
  2487. static int warnedAlready=0; //FIXME move this into the context perhaps
  2488. if (flags & SWS_PRINT_INFO && !warnedAlready) {
  2489. av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
  2490. " ->cannot do aligned memory accesses anymore\n");
  2491. warnedAlready=1;
  2492. }
  2493. }
2494. /* Note that the user might start scaling in the middle of the picture, in which
2495. case this will not get executed. This is not really intended, but it currently
2496. works, so people might rely on it. */
  2497. if (srcSliceY ==0) {
  2498. lumBufIndex=0;
  2499. chrBufIndex=0;
  2500. dstY=0;
  2501. lastInLumBuf= -1;
  2502. lastInChrBuf= -1;
  2503. }
  2504. lastDstY= dstY;
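/*
 * Main loop over output lines: for each dstY, work out which source lines the
 * vertical filters need, horizontally scale any that are not yet present in
 * the lumPixBuf/chrPixBuf ring buffers, then run the vertical scale + output
 * stage.  If the current slice does not contain all required input lines, the
 * loop breaks and continues on the next slice.
 */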
  2505. for (;dstY < dstH; dstY++) {
  2506. unsigned char *dest =dst[0]+dstStride[0]*dstY;
  2507. const int chrDstY= dstY>>c->chrDstVSubSample;
  2508. unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
  2509. unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
  2510. unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
  2511. const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
  2512. const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
  2513. int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
  2514. int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
  2515. int enough_lines;
  2516. //handle holes (FAST_BILINEAR & weird filters)
  2517. if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
  2518. if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
  2519. assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
  2520. assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
  2521. // Do we have enough lines in this slice to output the dstY line
  2522. enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
  2523. if (!enough_lines) {
  2524. lastLumSrcY = srcSliceY + srcSliceH - 1;
  2525. lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
  2526. }
  2527. DEBUG_BUFFERS("dstY: %d\n", dstY);
  2528. DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
  2529. firstLumSrcY, lastLumSrcY, lastInLumBuf);
  2530. DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
  2531. firstChrSrcY, lastChrSrcY, lastInChrBuf);
  2532. //Do horizontal scaling
  2533. while(lastInLumBuf < lastLumSrcY) {
  2534. uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
  2535. uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
  2536. lumBufIndex++;
  2537. DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
  2538. lumBufIndex, lastInLumBuf);
  2539. assert(lumBufIndex < 2*vLumBufSize);
  2540. assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
  2541. assert(lastInLumBuf + 1 - srcSliceY >= 0);
  2542. RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
  2543. flags, hLumFilter, hLumFilterPos, hLumFilterSize,
  2544. c->srcFormat, formatConvBuffer,
  2545. pal, 0);
  2546. if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
  2547. RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
  2548. flags, hLumFilter, hLumFilterPos, hLumFilterSize,
  2549. c->srcFormat, formatConvBuffer,
  2550. pal, 1);
  2551. lastInLumBuf++;
  2552. }
  2553. while(lastInChrBuf < lastChrSrcY) {
  2554. uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
  2555. uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
  2556. chrBufIndex++;
  2557. DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
  2558. chrBufIndex, lastInChrBuf);
  2559. assert(chrBufIndex < 2*vChrBufSize);
  2560. assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
  2561. assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
  2562. //FIXME replace parameters through context struct (some at least)
  2563. if (!(isGray(srcFormat) || isGray(dstFormat)))
  2564. RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
  2565. flags, hChrFilter, hChrFilterPos, hChrFilterSize,
  2566. c->srcFormat, formatConvBuffer,
  2567. pal);
  2568. lastInChrBuf++;
  2569. }
  2570. //wrap buf index around to stay inside the ring buffer
  2571. if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
  2572. if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
  2573. if (!enough_lines)
  2574. break; //we can't output a dstY line so let's try with the next slice
  2575. #if COMPILE_TEMPLATE_MMX
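/* Select the per-line ordered-dither rows for the 15/16 bpp writers: red and
   blue always quantize to 5 bits and use the ff_dither8 rows; green uses the
   finer ff_dither4 rows for 565 output (6 green bits) and ff_dither8 for 555.
   The dstY&1 index picks a different row on every line. */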
  2576. c->blueDither= ff_dither8[dstY&1];
  2577. if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
  2578. c->greenDither= ff_dither8[dstY&1];
  2579. else
  2580. c->greenDither= ff_dither4[dstY&1];
  2581. c->redDither= ff_dither8[(dstY+1)&1];
  2582. #endif
  2583. if (dstY < dstH-2) {
  2584. const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  2585. const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
  2586. const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
  2587. #if COMPILE_TEMPLATE_MMX
  2588. int i;
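/*
 * Pack the vertical filter into the layout the MMX output code expects.
 * With SWS_ACCURATE_RND every APCK_SIZE-byte entry holds the pointers to two
 * source lines plus their two 16-bit coefficients packed into one dword;
 * otherwise each 16-byte entry holds one line pointer split into two 32-bit
 * halves and its coefficient duplicated into both 16-bit halves of a dword.
 */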
  2589. if (flags & SWS_ACCURATE_RND) {
  2590. int s= APCK_SIZE / 8;
  2591. for (i=0; i<vLumFilterSize; i+=2) {
  2592. *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
  2593. *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
  2594. lumMmxFilter[s*i+APCK_COEF/4 ]=
  2595. lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
  2596. + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
  2597. if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
  2598. *(void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
  2599. *(void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
  2600. alpMmxFilter[s*i+APCK_COEF/4 ]=
  2601. alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
  2602. }
  2603. }
  2604. for (i=0; i<vChrFilterSize; i+=2) {
  2605. *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
  2606. *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
  2607. chrMmxFilter[s*i+APCK_COEF/4 ]=
  2608. chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
  2609. + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
  2610. }
  2611. } else {
  2612. for (i=0; i<vLumFilterSize; i++) {
  2613. lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
  2614. lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
  2615. lumMmxFilter[4*i+2]=
  2616. lumMmxFilter[4*i+3]=
  2617. ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
  2618. if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
  2619. alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
  2620. alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
  2621. alpMmxFilter[4*i+2]=
  2622. alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
  2623. }
  2624. }
  2625. for (i=0; i<vChrFilterSize; i++) {
  2626. chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
  2627. chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
  2628. chrMmxFilter[4*i+2]=
  2629. chrMmxFilter[4*i+3]=
  2630. ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
  2631. }
  2632. }
  2633. #endif
  2634. if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
  2635. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
  2636. if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
  2637. c->yuv2nv12X(c,
  2638. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2639. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2640. dest, uDest, dstW, chrDstW, dstFormat);
  2641. } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
  2642. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
  2643. if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
  2644. if (is16BPS(dstFormat)) {
  2645. yuv2yuvX16inC(
  2646. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2647. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2648. alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
  2649. dstFormat);
  2650. } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
  2651. int16_t *lumBuf = lumSrcPtr[0];
  2652. int16_t *chrBuf= chrSrcPtr[0];
  2653. int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
  2654. c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
  2655. } else { //General YV12
  2656. c->yuv2yuvX(c,
  2657. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2658. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2659. alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
  2660. }
  2661. } else {
  2662. assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
  2663. assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
  2664. if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
  2665. int chrAlpha= vChrFilter[2*dstY+1];
  2666. if(flags & SWS_FULL_CHR_H_INT) {
  2667. yuv2rgbXinC_full(c, //FIXME write a packed1_full function
  2668. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2669. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2670. alpSrcPtr, dest, dstW, dstY);
  2671. } else {
  2672. c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
  2673. alpPixBuf ? *alpSrcPtr : NULL,
  2674. dest, dstW, chrAlpha, dstFormat, flags, dstY);
  2675. }
  2676. } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
  2677. int lumAlpha= vLumFilter[2*dstY+1];
  2678. int chrAlpha= vChrFilter[2*dstY+1];
  2679. lumMmxFilter[2]=
  2680. lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
  2681. chrMmxFilter[2]=
  2682. chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
  2683. if(flags & SWS_FULL_CHR_H_INT) {
  2684. yuv2rgbXinC_full(c, //FIXME write a packed2_full function
  2685. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2686. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2687. alpSrcPtr, dest, dstW, dstY);
  2688. } else {
  2689. c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
  2690. alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
  2691. dest, dstW, lumAlpha, chrAlpha, dstY);
  2692. }
  2693. } else { //general RGB
  2694. if(flags & SWS_FULL_CHR_H_INT) {
  2695. yuv2rgbXinC_full(c,
  2696. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2697. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2698. alpSrcPtr, dest, dstW, dstY);
  2699. } else {
  2700. c->yuv2packedX(c,
  2701. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2702. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2703. alpSrcPtr, dest, dstW, dstY);
  2704. }
  2705. }
  2706. }
  2707. } else { // hmm looks like we can't use MMX here without overwriting this array's tail
  2708. const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  2709. const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
  2710. const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
  2711. if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
  2712. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
  2713. if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
  2714. yuv2nv12XinC(
  2715. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2716. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2717. dest, uDest, dstW, chrDstW, dstFormat);
  2718. } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
  2719. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
  2720. if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
  2721. if (is16BPS(dstFormat)) {
  2722. yuv2yuvX16inC(
  2723. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2724. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2725. alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
  2726. dstFormat);
  2727. } else {
  2728. yuv2yuvXinC(
  2729. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2730. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2731. alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
  2732. }
  2733. } else {
  2734. assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
  2735. assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
  2736. if(flags & SWS_FULL_CHR_H_INT) {
  2737. yuv2rgbXinC_full(c,
  2738. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2739. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2740. alpSrcPtr, dest, dstW, dstY);
  2741. } else {
  2742. yuv2packedXinC(c,
  2743. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2744. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2745. alpSrcPtr, dest, dstW, dstY);
  2746. }
  2747. }
  2748. }
  2749. }
  2750. if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
  2751. fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
  2752. #if COMPILE_TEMPLATE_MMX
  2753. if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory");
  2754. /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
  2755. if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
  2756. else __asm__ volatile("emms" :::"memory");
  2757. #endif
  2758. /* store changed local vars back in the context */
  2759. c->dstY= dstY;
  2760. c->lumBufIndex= lumBufIndex;
  2761. c->chrBufIndex= chrBufIndex;
  2762. c->lastInLumBuf= lastInLumBuf;
  2763. c->lastInChrBuf= lastInChrBuf;
  2764. return dstY - lastDstY;
  2765. }
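/*
 * Fill in the function pointers for this template instantiation (plain C,
 * MMX, MMX2, 3DNow! or AltiVec), plus the per-input-format converters that
 * turn packed, paletted or 16-bit input into the planar 8-bit samples the
 * horizontal scalers expect.
 */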
  2766. static void RENAME(sws_init_swScale)(SwsContext *c)
  2767. {
  2768. enum PixelFormat srcFormat = c->srcFormat;
  2769. c->yuv2nv12X = RENAME(yuv2nv12X );
  2770. c->yuv2yuv1 = RENAME(yuv2yuv1 );
  2771. c->yuv2yuvX = RENAME(yuv2yuvX );
  2772. c->yuv2packed1 = RENAME(yuv2packed1 );
  2773. c->yuv2packed2 = RENAME(yuv2packed2 );
  2774. c->yuv2packedX = RENAME(yuv2packedX );
  2775. c->hScale = RENAME(hScale );
  2776. c->hyscale_fast = RENAME(hyscale_fast);
  2777. c->hcscale_fast = RENAME(hcscale_fast);
  2778. c->hcscale_internal = NULL;
  2779. switch(srcFormat) {
  2780. case PIX_FMT_YUYV422 : c->hcscale_internal = RENAME(yuy2ToUV); break;
  2781. case PIX_FMT_UYVY422 : c->hcscale_internal = RENAME(uyvyToUV); break;
  2782. case PIX_FMT_NV12 : c->hcscale_internal = nv12ToUV; break;
  2783. case PIX_FMT_NV21 : c->hcscale_internal = nv21ToUV; break;
  2784. case PIX_FMT_RGB8 :
  2785. case PIX_FMT_BGR8 :
  2786. case PIX_FMT_PAL8 :
  2787. case PIX_FMT_BGR4_BYTE:
  2788. case PIX_FMT_RGB4_BYTE: c->hcscale_internal = palToUV; break;
  2789. case PIX_FMT_YUV420P16BE:
  2790. case PIX_FMT_YUV422P16BE:
  2791. case PIX_FMT_YUV444P16BE: c->hcscale_internal = RENAME(BEToUV); break;
  2792. case PIX_FMT_YUV420P16LE:
  2793. case PIX_FMT_YUV422P16LE:
  2794. case PIX_FMT_YUV444P16LE: c->hcscale_internal = RENAME(LEToUV); break;
  2795. }
  2796. if (c->chrSrcHSubSample) {
  2797. switch(srcFormat) {
  2798. case PIX_FMT_RGB48BE:
  2799. case PIX_FMT_RGB48LE: c->hcscale_internal = rgb48ToUV_half; break;
  2800. case PIX_FMT_RGB32 :
  2801. case PIX_FMT_RGB32_1: c->hcscale_internal = bgr32ToUV_half; break;
  2802. case PIX_FMT_BGR24 : c->hcscale_internal = RENAME(bgr24ToUV_half); break;
  2803. case PIX_FMT_BGR565 : c->hcscale_internal = bgr16ToUV_half; break;
  2804. case PIX_FMT_BGR555 : c->hcscale_internal = bgr15ToUV_half; break;
  2805. case PIX_FMT_BGR32 :
  2806. case PIX_FMT_BGR32_1: c->hcscale_internal = rgb32ToUV_half; break;
  2807. case PIX_FMT_RGB24 : c->hcscale_internal = RENAME(rgb24ToUV_half); break;
  2808. case PIX_FMT_RGB565 : c->hcscale_internal = rgb16ToUV_half; break;
  2809. case PIX_FMT_RGB555 : c->hcscale_internal = rgb15ToUV_half; break;
  2810. }
  2811. } else {
  2812. switch(srcFormat) {
  2813. case PIX_FMT_RGB48BE:
  2814. case PIX_FMT_RGB48LE: c->hcscale_internal = rgb48ToUV; break;
  2815. case PIX_FMT_RGB32 :
  2816. case PIX_FMT_RGB32_1: c->hcscale_internal = bgr32ToUV; break;
  2817. case PIX_FMT_BGR24 : c->hcscale_internal = RENAME(bgr24ToUV); break;
  2818. case PIX_FMT_BGR565 : c->hcscale_internal = bgr16ToUV; break;
  2819. case PIX_FMT_BGR555 : c->hcscale_internal = bgr15ToUV; break;
  2820. case PIX_FMT_BGR32 :
  2821. case PIX_FMT_BGR32_1: c->hcscale_internal = rgb32ToUV; break;
  2822. case PIX_FMT_RGB24 : c->hcscale_internal = RENAME(rgb24ToUV); break;
  2823. case PIX_FMT_RGB565 : c->hcscale_internal = rgb16ToUV; break;
  2824. case PIX_FMT_RGB555 : c->hcscale_internal = rgb15ToUV; break;
  2825. }
  2826. }
  2827. c->hyscale_internal = NULL;
  2828. c->hascale_internal = NULL;
  2829. switch (srcFormat) {
  2830. case PIX_FMT_YUYV422 :
  2831. case PIX_FMT_YUV420P16BE:
  2832. case PIX_FMT_YUV422P16BE:
  2833. case PIX_FMT_YUV444P16BE:
  2834. case PIX_FMT_GRAY16BE : c->hyscale_internal = RENAME(yuy2ToY); break;
  2835. case PIX_FMT_UYVY422 :
  2836. case PIX_FMT_YUV420P16LE:
  2837. case PIX_FMT_YUV422P16LE:
  2838. case PIX_FMT_YUV444P16LE:
  2839. case PIX_FMT_GRAY16LE : c->hyscale_internal = RENAME(uyvyToY); break;
  2840. case PIX_FMT_BGR24 : c->hyscale_internal = RENAME(bgr24ToY); break;
  2841. case PIX_FMT_BGR565 : c->hyscale_internal = bgr16ToY; break;
  2842. case PIX_FMT_BGR555 : c->hyscale_internal = bgr15ToY; break;
  2843. case PIX_FMT_RGB24 : c->hyscale_internal = RENAME(rgb24ToY); break;
  2844. case PIX_FMT_RGB565 : c->hyscale_internal = rgb16ToY; break;
  2845. case PIX_FMT_RGB555 : c->hyscale_internal = rgb15ToY; break;
  2846. case PIX_FMT_RGB8 :
  2847. case PIX_FMT_BGR8 :
  2848. case PIX_FMT_PAL8 :
  2849. case PIX_FMT_BGR4_BYTE:
  2850. case PIX_FMT_RGB4_BYTE: c->hyscale_internal = palToY; break;
  2851. case PIX_FMT_MONOBLACK: c->hyscale_internal = monoblack2Y; break;
  2852. case PIX_FMT_MONOWHITE: c->hyscale_internal = monowhite2Y; break;
  2853. case PIX_FMT_RGB32 :
  2854. case PIX_FMT_RGB32_1: c->hyscale_internal = bgr32ToY; break;
  2855. case PIX_FMT_BGR32 :
  2856. case PIX_FMT_BGR32_1: c->hyscale_internal = rgb32ToY; break;
  2857. case PIX_FMT_RGB48BE:
  2858. case PIX_FMT_RGB48LE: c->hyscale_internal = rgb48ToY; break;
  2859. }
  2860. if (c->alpPixBuf) {
  2861. switch (srcFormat) {
  2862. case PIX_FMT_RGB32 :
  2863. case PIX_FMT_RGB32_1:
  2864. case PIX_FMT_BGR32 :
  2865. case PIX_FMT_BGR32_1: c->hascale_internal = abgrToA; break;
  2866. }
  2867. }
  2868. }