  1. /*
  2. * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  3. *
  4. * This file is part of Libav.
  5. *
  6. * Libav is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * Libav is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with Libav; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "swscale_template.h"
  21. #undef REAL_MOVNTQ
  22. #undef MOVNTQ
  23. #undef PAVGB
  24. #undef PREFETCH
  25. #if COMPILE_TEMPLATE_AMD3DNOW
  26. #define PREFETCH "prefetch"
  27. #elif COMPILE_TEMPLATE_MMX2
  28. #define PREFETCH "prefetchnta"
  29. #else
  30. #define PREFETCH " # nop"
  31. #endif
  32. #if COMPILE_TEMPLATE_MMX2
  33. #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
  34. #elif COMPILE_TEMPLATE_AMD3DNOW
  35. #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
  36. #endif
  37. #if COMPILE_TEMPLATE_MMX2
  38. #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
  39. #else
  40. #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
  41. #endif
  42. #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
  43. #define YSCALEYUV2YV12X(x, offset, dest, width) \
  44. __asm__ volatile(\
  45. "xor %%"REG_a", %%"REG_a" \n\t"\
  46. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  47. "movq %%mm3, %%mm4 \n\t"\
  48. "lea " offset "(%0), %%"REG_d" \n\t"\
  49. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  50. ".p2align 4 \n\t" /* FIXME Unroll? */\
  51. "1: \n\t"\
  52. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  53. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
  54. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
  55. "add $16, %%"REG_d" \n\t"\
  56. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  57. "test %%"REG_S", %%"REG_S" \n\t"\
  58. "pmulhw %%mm0, %%mm2 \n\t"\
  59. "pmulhw %%mm0, %%mm5 \n\t"\
  60. "paddw %%mm2, %%mm3 \n\t"\
  61. "paddw %%mm5, %%mm4 \n\t"\
  62. " jnz 1b \n\t"\
  63. "psraw $3, %%mm3 \n\t"\
  64. "psraw $3, %%mm4 \n\t"\
  65. "packuswb %%mm4, %%mm3 \n\t"\
  66. MOVNTQ(%%mm3, (%1, %%REGa))\
  67. "add $8, %%"REG_a" \n\t"\
  68. "cmp %2, %%"REG_a" \n\t"\
  69. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  70. "movq %%mm3, %%mm4 \n\t"\
  71. "lea " offset "(%0), %%"REG_d" \n\t"\
  72. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  73. "jb 1b \n\t"\
  74. :: "r" (&c->redDither),\
  75. "r" (dest), "g" ((x86_reg)width)\
  76. : "%"REG_a, "%"REG_d, "%"REG_S\
  77. );
  78. #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
  79. __asm__ volatile(\
  80. "lea " offset "(%0), %%"REG_d" \n\t"\
  81. "xor %%"REG_a", %%"REG_a" \n\t"\
  82. "pxor %%mm4, %%mm4 \n\t"\
  83. "pxor %%mm5, %%mm5 \n\t"\
  84. "pxor %%mm6, %%mm6 \n\t"\
  85. "pxor %%mm7, %%mm7 \n\t"\
  86. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  87. ".p2align 4 \n\t"\
  88. "1: \n\t"\
  89. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
  90. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
  91. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  92. "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
  93. "movq %%mm0, %%mm3 \n\t"\
  94. "punpcklwd %%mm1, %%mm0 \n\t"\
  95. "punpckhwd %%mm1, %%mm3 \n\t"\
  96. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
  97. "pmaddwd %%mm1, %%mm0 \n\t"\
  98. "pmaddwd %%mm1, %%mm3 \n\t"\
  99. "paddd %%mm0, %%mm4 \n\t"\
  100. "paddd %%mm3, %%mm5 \n\t"\
  101. "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
  102. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  103. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  104. "test %%"REG_S", %%"REG_S" \n\t"\
  105. "movq %%mm2, %%mm0 \n\t"\
  106. "punpcklwd %%mm3, %%mm2 \n\t"\
  107. "punpckhwd %%mm3, %%mm0 \n\t"\
  108. "pmaddwd %%mm1, %%mm2 \n\t"\
  109. "pmaddwd %%mm1, %%mm0 \n\t"\
  110. "paddd %%mm2, %%mm6 \n\t"\
  111. "paddd %%mm0, %%mm7 \n\t"\
  112. " jnz 1b \n\t"\
  113. "psrad $16, %%mm4 \n\t"\
  114. "psrad $16, %%mm5 \n\t"\
  115. "psrad $16, %%mm6 \n\t"\
  116. "psrad $16, %%mm7 \n\t"\
  117. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  118. "packssdw %%mm5, %%mm4 \n\t"\
  119. "packssdw %%mm7, %%mm6 \n\t"\
  120. "paddw %%mm0, %%mm4 \n\t"\
  121. "paddw %%mm0, %%mm6 \n\t"\
  122. "psraw $3, %%mm4 \n\t"\
  123. "psraw $3, %%mm6 \n\t"\
  124. "packuswb %%mm6, %%mm4 \n\t"\
  125. MOVNTQ(%%mm4, (%1, %%REGa))\
  126. "add $8, %%"REG_a" \n\t"\
  127. "cmp %2, %%"REG_a" \n\t"\
  128. "lea " offset "(%0), %%"REG_d" \n\t"\
  129. "pxor %%mm4, %%mm4 \n\t"\
  130. "pxor %%mm5, %%mm5 \n\t"\
  131. "pxor %%mm6, %%mm6 \n\t"\
  132. "pxor %%mm7, %%mm7 \n\t"\
  133. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  134. "jb 1b \n\t"\
  135. :: "r" (&c->redDither),\
  136. "r" (dest), "g" ((x86_reg)width)\
  137. : "%"REG_a, "%"REG_d, "%"REG_S\
  138. );
  139. #define YSCALEYUV2YV121 \
  140. "mov %2, %%"REG_a" \n\t"\
  141. ".p2align 4 \n\t" /* FIXME Unroll? */\
  142. "1: \n\t"\
  143. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
  144. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
  145. "psraw $7, %%mm0 \n\t"\
  146. "psraw $7, %%mm1 \n\t"\
  147. "packuswb %%mm1, %%mm0 \n\t"\
  148. MOVNTQ(%%mm0, (%1, %%REGa))\
  149. "add $8, %%"REG_a" \n\t"\
  150. "jnc 1b \n\t"
  151. #define YSCALEYUV2YV121_ACCURATE \
  152. "mov %2, %%"REG_a" \n\t"\
  153. "pcmpeqw %%mm7, %%mm7 \n\t"\
  154. "psrlw $15, %%mm7 \n\t"\
  155. "psllw $6, %%mm7 \n\t"\
  156. ".p2align 4 \n\t" /* FIXME Unroll? */\
  157. "1: \n\t"\
  158. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
  159. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
  160. "paddsw %%mm7, %%mm0 \n\t"\
  161. "paddsw %%mm7, %%mm1 \n\t"\
  162. "psraw $7, %%mm0 \n\t"\
  163. "psraw $7, %%mm1 \n\t"\
  164. "packuswb %%mm1, %%mm0 \n\t"\
  165. MOVNTQ(%%mm0, (%1, %%REGa))\
  166. "add $8, %%"REG_a" \n\t"\
  167. "jnc 1b \n\t"
  168. /*
  169. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  170. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  171. "r" (dest), "m" (dstW_reg),
  172. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  173. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  174. */
  175. #define YSCALEYUV2PACKEDX_UV \
  176. __asm__ volatile(\
  177. "xor %%"REG_a", %%"REG_a" \n\t"\
  178. ".p2align 4 \n\t"\
  179. "nop \n\t"\
  180. "1: \n\t"\
  181. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  182. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  183. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  184. "movq %%mm3, %%mm4 \n\t"\
  185. ".p2align 4 \n\t"\
  186. "2: \n\t"\
  187. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  188. "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
  189. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
  190. "add $16, %%"REG_d" \n\t"\
  191. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  192. "pmulhw %%mm0, %%mm2 \n\t"\
  193. "pmulhw %%mm0, %%mm5 \n\t"\
  194. "paddw %%mm2, %%mm3 \n\t"\
  195. "paddw %%mm5, %%mm4 \n\t"\
  196. "test %%"REG_S", %%"REG_S" \n\t"\
  197. " jnz 2b \n\t"\
  198. #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
  199. "lea "offset"(%0), %%"REG_d" \n\t"\
  200. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  201. "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
  202. "movq "#dst1", "#dst2" \n\t"\
  203. ".p2align 4 \n\t"\
  204. "2: \n\t"\
  205. "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
  206. "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
  207. "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
  208. "add $16, %%"REG_d" \n\t"\
  209. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  210. "pmulhw "#coeff", "#src1" \n\t"\
  211. "pmulhw "#coeff", "#src2" \n\t"\
  212. "paddw "#src1", "#dst1" \n\t"\
  213. "paddw "#src2", "#dst2" \n\t"\
  214. "test %%"REG_S", %%"REG_S" \n\t"\
  215. " jnz 2b \n\t"\
  216. #define YSCALEYUV2PACKEDX \
  217. YSCALEYUV2PACKEDX_UV \
  218. YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
  219. #define YSCALEYUV2PACKEDX_END \
  220. :: "r" (&c->redDither), \
  221. "m" (dummy), "m" (dummy), "m" (dummy),\
  222. "r" (dest), "m" (dstW_reg) \
  223. : "%"REG_a, "%"REG_d, "%"REG_S \
  224. );
  225. #define YSCALEYUV2PACKEDX_ACCURATE_UV \
  226. __asm__ volatile(\
  227. "xor %%"REG_a", %%"REG_a" \n\t"\
  228. ".p2align 4 \n\t"\
  229. "nop \n\t"\
  230. "1: \n\t"\
  231. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  232. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  233. "pxor %%mm4, %%mm4 \n\t"\
  234. "pxor %%mm5, %%mm5 \n\t"\
  235. "pxor %%mm6, %%mm6 \n\t"\
  236. "pxor %%mm7, %%mm7 \n\t"\
  237. ".p2align 4 \n\t"\
  238. "2: \n\t"\
  239. "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
  240. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
  241. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  242. "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
  243. "movq %%mm0, %%mm3 \n\t"\
  244. "punpcklwd %%mm1, %%mm0 \n\t"\
  245. "punpckhwd %%mm1, %%mm3 \n\t"\
  246. "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
  247. "pmaddwd %%mm1, %%mm0 \n\t"\
  248. "pmaddwd %%mm1, %%mm3 \n\t"\
  249. "paddd %%mm0, %%mm4 \n\t"\
  250. "paddd %%mm3, %%mm5 \n\t"\
  251. "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
  252. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  253. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  254. "test %%"REG_S", %%"REG_S" \n\t"\
  255. "movq %%mm2, %%mm0 \n\t"\
  256. "punpcklwd %%mm3, %%mm2 \n\t"\
  257. "punpckhwd %%mm3, %%mm0 \n\t"\
  258. "pmaddwd %%mm1, %%mm2 \n\t"\
  259. "pmaddwd %%mm1, %%mm0 \n\t"\
  260. "paddd %%mm2, %%mm6 \n\t"\
  261. "paddd %%mm0, %%mm7 \n\t"\
  262. " jnz 2b \n\t"\
  263. "psrad $16, %%mm4 \n\t"\
  264. "psrad $16, %%mm5 \n\t"\
  265. "psrad $16, %%mm6 \n\t"\
  266. "psrad $16, %%mm7 \n\t"\
  267. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  268. "packssdw %%mm5, %%mm4 \n\t"\
  269. "packssdw %%mm7, %%mm6 \n\t"\
  270. "paddw %%mm0, %%mm4 \n\t"\
  271. "paddw %%mm0, %%mm6 \n\t"\
  272. "movq %%mm4, "U_TEMP"(%0) \n\t"\
  273. "movq %%mm6, "V_TEMP"(%0) \n\t"\
  274. #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
  275. "lea "offset"(%0), %%"REG_d" \n\t"\
  276. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  277. "pxor %%mm1, %%mm1 \n\t"\
  278. "pxor %%mm5, %%mm5 \n\t"\
  279. "pxor %%mm7, %%mm7 \n\t"\
  280. "pxor %%mm6, %%mm6 \n\t"\
  281. ".p2align 4 \n\t"\
  282. "2: \n\t"\
  283. "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
  284. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
  285. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  286. "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
  287. "movq %%mm0, %%mm3 \n\t"\
  288. "punpcklwd %%mm4, %%mm0 \n\t"\
  289. "punpckhwd %%mm4, %%mm3 \n\t"\
  290. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
  291. "pmaddwd %%mm4, %%mm0 \n\t"\
  292. "pmaddwd %%mm4, %%mm3 \n\t"\
  293. "paddd %%mm0, %%mm1 \n\t"\
  294. "paddd %%mm3, %%mm5 \n\t"\
  295. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
  296. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  297. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  298. "test %%"REG_S", %%"REG_S" \n\t"\
  299. "movq %%mm2, %%mm0 \n\t"\
  300. "punpcklwd %%mm3, %%mm2 \n\t"\
  301. "punpckhwd %%mm3, %%mm0 \n\t"\
  302. "pmaddwd %%mm4, %%mm2 \n\t"\
  303. "pmaddwd %%mm4, %%mm0 \n\t"\
  304. "paddd %%mm2, %%mm7 \n\t"\
  305. "paddd %%mm0, %%mm6 \n\t"\
  306. " jnz 2b \n\t"\
  307. "psrad $16, %%mm1 \n\t"\
  308. "psrad $16, %%mm5 \n\t"\
  309. "psrad $16, %%mm7 \n\t"\
  310. "psrad $16, %%mm6 \n\t"\
  311. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  312. "packssdw %%mm5, %%mm1 \n\t"\
  313. "packssdw %%mm6, %%mm7 \n\t"\
  314. "paddw %%mm0, %%mm1 \n\t"\
  315. "paddw %%mm0, %%mm7 \n\t"\
  316. "movq "U_TEMP"(%0), %%mm3 \n\t"\
  317. "movq "V_TEMP"(%0), %%mm4 \n\t"\
  318. #define YSCALEYUV2PACKEDX_ACCURATE \
  319. YSCALEYUV2PACKEDX_ACCURATE_UV \
  320. YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
  321. #define YSCALEYUV2RGBX \
  322. "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
  323. "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
  324. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  325. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  326. "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
  327. "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
  328. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  329. "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
  330. "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
  331. "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
  332. "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
  333. "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
  334. "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
  335. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  336. "paddw %%mm3, %%mm4 \n\t"\
  337. "movq %%mm2, %%mm0 \n\t"\
  338. "movq %%mm5, %%mm6 \n\t"\
  339. "movq %%mm4, %%mm3 \n\t"\
  340. "punpcklwd %%mm2, %%mm2 \n\t"\
  341. "punpcklwd %%mm5, %%mm5 \n\t"\
  342. "punpcklwd %%mm4, %%mm4 \n\t"\
  343. "paddw %%mm1, %%mm2 \n\t"\
  344. "paddw %%mm1, %%mm5 \n\t"\
  345. "paddw %%mm1, %%mm4 \n\t"\
  346. "punpckhwd %%mm0, %%mm0 \n\t"\
  347. "punpckhwd %%mm6, %%mm6 \n\t"\
  348. "punpckhwd %%mm3, %%mm3 \n\t"\
  349. "paddw %%mm7, %%mm0 \n\t"\
  350. "paddw %%mm7, %%mm6 \n\t"\
  351. "paddw %%mm7, %%mm3 \n\t"\
  352. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  353. "packuswb %%mm0, %%mm2 \n\t"\
  354. "packuswb %%mm6, %%mm5 \n\t"\
  355. "packuswb %%mm3, %%mm4 \n\t"\
  356. #define REAL_YSCALEYUV2PACKED(index, c) \
  357. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  358. "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
  359. "psraw $3, %%mm0 \n\t"\
  360. "psraw $3, %%mm1 \n\t"\
  361. "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  362. "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  363. "xor "#index", "#index" \n\t"\
  364. ".p2align 4 \n\t"\
  365. "1: \n\t"\
  366. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  367. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  368. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  369. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  370. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  371. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  372. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  373. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  374. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  375. "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  376. "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  377. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  378. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  379. "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  380. "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  381. "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  382. "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  383. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  384. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  385. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  386. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  387. "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  388. "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  389. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  390. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  391. #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
  392. #define REAL_YSCALEYUV2RGB_UV(index, c) \
  393. "xor "#index", "#index" \n\t"\
  394. ".p2align 4 \n\t"\
  395. "1: \n\t"\
  396. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  397. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  398. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  399. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  400. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  401. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  402. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  403. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  404. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  405. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  406. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  407. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  408. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  409. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  410. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  411. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  412. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  413. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  414. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  415. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  416. #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
  417. "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  418. "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  419. "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  420. "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  421. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  422. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  423. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  424. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  425. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  426. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  427. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  428. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  429. #define REAL_YSCALEYUV2RGB_COEFF(c) \
  430. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  431. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  432. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  433. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  434. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  435. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  436. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  437. "paddw %%mm3, %%mm4 \n\t"\
  438. "movq %%mm2, %%mm0 \n\t"\
  439. "movq %%mm5, %%mm6 \n\t"\
  440. "movq %%mm4, %%mm3 \n\t"\
  441. "punpcklwd %%mm2, %%mm2 \n\t"\
  442. "punpcklwd %%mm5, %%mm5 \n\t"\
  443. "punpcklwd %%mm4, %%mm4 \n\t"\
  444. "paddw %%mm1, %%mm2 \n\t"\
  445. "paddw %%mm1, %%mm5 \n\t"\
  446. "paddw %%mm1, %%mm4 \n\t"\
  447. "punpckhwd %%mm0, %%mm0 \n\t"\
  448. "punpckhwd %%mm6, %%mm6 \n\t"\
  449. "punpckhwd %%mm3, %%mm3 \n\t"\
  450. "paddw %%mm7, %%mm0 \n\t"\
  451. "paddw %%mm7, %%mm6 \n\t"\
  452. "paddw %%mm7, %%mm3 \n\t"\
  453. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  454. "packuswb %%mm0, %%mm2 \n\t"\
  455. "packuswb %%mm6, %%mm5 \n\t"\
  456. "packuswb %%mm3, %%mm4 \n\t"\
  457. #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
  458. #define YSCALEYUV2RGB(index, c) \
  459. REAL_YSCALEYUV2RGB_UV(index, c) \
  460. REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
  461. REAL_YSCALEYUV2RGB_COEFF(c)
  462. #define REAL_YSCALEYUV2PACKED1(index, c) \
  463. "xor "#index", "#index" \n\t"\
  464. ".p2align 4 \n\t"\
  465. "1: \n\t"\
  466. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  467. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  468. "psraw $7, %%mm3 \n\t" \
  469. "psraw $7, %%mm4 \n\t" \
  470. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  471. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  472. "psraw $7, %%mm1 \n\t" \
  473. "psraw $7, %%mm7 \n\t" \
  474. #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
  475. #define REAL_YSCALEYUV2RGB1(index, c) \
  476. "xor "#index", "#index" \n\t"\
  477. ".p2align 4 \n\t"\
  478. "1: \n\t"\
  479. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  480. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  481. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  482. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  483. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  484. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  485. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  486. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  487. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  488. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  489. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  490. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  491. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  492. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  493. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  494. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  495. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  496. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  497. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  498. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  499. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  500. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  501. "paddw %%mm3, %%mm4 \n\t"\
  502. "movq %%mm2, %%mm0 \n\t"\
  503. "movq %%mm5, %%mm6 \n\t"\
  504. "movq %%mm4, %%mm3 \n\t"\
  505. "punpcklwd %%mm2, %%mm2 \n\t"\
  506. "punpcklwd %%mm5, %%mm5 \n\t"\
  507. "punpcklwd %%mm4, %%mm4 \n\t"\
  508. "paddw %%mm1, %%mm2 \n\t"\
  509. "paddw %%mm1, %%mm5 \n\t"\
  510. "paddw %%mm1, %%mm4 \n\t"\
  511. "punpckhwd %%mm0, %%mm0 \n\t"\
  512. "punpckhwd %%mm6, %%mm6 \n\t"\
  513. "punpckhwd %%mm3, %%mm3 \n\t"\
  514. "paddw %%mm7, %%mm0 \n\t"\
  515. "paddw %%mm7, %%mm6 \n\t"\
  516. "paddw %%mm7, %%mm3 \n\t"\
  517. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  518. "packuswb %%mm0, %%mm2 \n\t"\
  519. "packuswb %%mm6, %%mm5 \n\t"\
  520. "packuswb %%mm3, %%mm4 \n\t"\
  521. #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
  522. #define REAL_YSCALEYUV2PACKED1b(index, c) \
  523. "xor "#index", "#index" \n\t"\
  524. ".p2align 4 \n\t"\
  525. "1: \n\t"\
  526. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  527. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  528. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  529. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  530. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  531. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  532. "psrlw $8, %%mm3 \n\t" \
  533. "psrlw $8, %%mm4 \n\t" \
  534. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  535. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  536. "psraw $7, %%mm1 \n\t" \
  537. "psraw $7, %%mm7 \n\t"
  538. #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
  539. // do vertical chrominance interpolation
  540. #define REAL_YSCALEYUV2RGB1b(index, c) \
  541. "xor "#index", "#index" \n\t"\
  542. ".p2align 4 \n\t"\
  543. "1: \n\t"\
  544. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  545. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  546. "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  547. "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  548. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  549. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  550. "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
  551. "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
  552. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  553. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  554. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  555. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  556. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  557. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  558. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  559. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  560. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  561. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  562. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  563. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  564. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  565. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  566. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  567. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  568. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  569. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  570. "paddw %%mm3, %%mm4 \n\t"\
  571. "movq %%mm2, %%mm0 \n\t"\
  572. "movq %%mm5, %%mm6 \n\t"\
  573. "movq %%mm4, %%mm3 \n\t"\
  574. "punpcklwd %%mm2, %%mm2 \n\t"\
  575. "punpcklwd %%mm5, %%mm5 \n\t"\
  576. "punpcklwd %%mm4, %%mm4 \n\t"\
  577. "paddw %%mm1, %%mm2 \n\t"\
  578. "paddw %%mm1, %%mm5 \n\t"\
  579. "paddw %%mm1, %%mm4 \n\t"\
  580. "punpckhwd %%mm0, %%mm0 \n\t"\
  581. "punpckhwd %%mm6, %%mm6 \n\t"\
  582. "punpckhwd %%mm3, %%mm3 \n\t"\
  583. "paddw %%mm7, %%mm0 \n\t"\
  584. "paddw %%mm7, %%mm6 \n\t"\
  585. "paddw %%mm7, %%mm3 \n\t"\
  586. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  587. "packuswb %%mm0, %%mm2 \n\t"\
  588. "packuswb %%mm6, %%mm5 \n\t"\
  589. "packuswb %%mm3, %%mm4 \n\t"\
  590. #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
  591. #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
  592. "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
  593. "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
  594. "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
  595. "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
  596. "packuswb %%mm1, %%mm7 \n\t"
  597. #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
  598. #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
  599. "movq "#b", "#q2" \n\t" /* B */\
  600. "movq "#r", "#t" \n\t" /* R */\
  601. "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
  602. "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
  603. "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
  604. "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
  605. "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
  606. "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
  607. "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
  608. "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
  609. "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
  610. "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
  611. \
  612. MOVNTQ( q0, (dst, index, 4))\
  613. MOVNTQ( b, 8(dst, index, 4))\
  614. MOVNTQ( q2, 16(dst, index, 4))\
  615. MOVNTQ( q3, 24(dst, index, 4))\
  616. \
  617. "add $8, "#index" \n\t"\
  618. "cmp "#dstw", "#index" \n\t"\
  619. " jb 1b \n\t"
  620. #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
  621. #define REAL_WRITERGB16(dst, dstw, index) \
  622. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  623. "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
  624. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  625. "psrlq $3, %%mm2 \n\t"\
  626. \
  627. "movq %%mm2, %%mm1 \n\t"\
  628. "movq %%mm4, %%mm3 \n\t"\
  629. \
  630. "punpcklbw %%mm7, %%mm3 \n\t"\
  631. "punpcklbw %%mm5, %%mm2 \n\t"\
  632. "punpckhbw %%mm7, %%mm4 \n\t"\
  633. "punpckhbw %%mm5, %%mm1 \n\t"\
  634. \
  635. "psllq $3, %%mm3 \n\t"\
  636. "psllq $3, %%mm4 \n\t"\
  637. \
  638. "por %%mm3, %%mm2 \n\t"\
  639. "por %%mm4, %%mm1 \n\t"\
  640. \
  641. MOVNTQ(%%mm2, (dst, index, 2))\
  642. MOVNTQ(%%mm1, 8(dst, index, 2))\
  643. \
  644. "add $8, "#index" \n\t"\
  645. "cmp "#dstw", "#index" \n\t"\
  646. " jb 1b \n\t"
  647. #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
  648. #define REAL_WRITERGB15(dst, dstw, index) \
  649. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  650. "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
  651. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  652. "psrlq $3, %%mm2 \n\t"\
  653. "psrlq $1, %%mm5 \n\t"\
  654. \
  655. "movq %%mm2, %%mm1 \n\t"\
  656. "movq %%mm4, %%mm3 \n\t"\
  657. \
  658. "punpcklbw %%mm7, %%mm3 \n\t"\
  659. "punpcklbw %%mm5, %%mm2 \n\t"\
  660. "punpckhbw %%mm7, %%mm4 \n\t"\
  661. "punpckhbw %%mm5, %%mm1 \n\t"\
  662. \
  663. "psllq $2, %%mm3 \n\t"\
  664. "psllq $2, %%mm4 \n\t"\
  665. \
  666. "por %%mm3, %%mm2 \n\t"\
  667. "por %%mm4, %%mm1 \n\t"\
  668. \
  669. MOVNTQ(%%mm2, (dst, index, 2))\
  670. MOVNTQ(%%mm1, 8(dst, index, 2))\
  671. \
  672. "add $8, "#index" \n\t"\
  673. "cmp "#dstw", "#index" \n\t"\
  674. " jb 1b \n\t"
  675. #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
  676. #define WRITEBGR24OLD(dst, dstw, index) \
  677. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  678. "movq %%mm2, %%mm1 \n\t" /* B */\
  679. "movq %%mm5, %%mm6 \n\t" /* R */\
  680. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  681. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  682. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  683. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  684. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  685. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  686. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  687. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  688. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  689. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  690. \
  691. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  692. "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
  693. "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
  694. "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
  695. "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
  696. "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
  697. "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
  698. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  699. \
  700. "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  701. "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
  702. "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
  703. "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
  704. "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
  705. "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
  706. "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
  707. "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
  708. "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
  709. "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
  710. "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
  711. "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
  712. "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
  713. \
  714. "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
  715. "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
  716. "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
  717. "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
  718. "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
  719. "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
  720. "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
  721. "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
  722. \
  723. MOVNTQ(%%mm0, (dst))\
  724. MOVNTQ(%%mm2, 8(dst))\
  725. MOVNTQ(%%mm3, 16(dst))\
  726. "add $24, "#dst" \n\t"\
  727. \
  728. "add $8, "#index" \n\t"\
  729. "cmp "#dstw", "#index" \n\t"\
  730. " jb 1b \n\t"
  731. #define WRITEBGR24MMX(dst, dstw, index) \
  732. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  733. "movq %%mm2, %%mm1 \n\t" /* B */\
  734. "movq %%mm5, %%mm6 \n\t" /* R */\
  735. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  736. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  737. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  738. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  739. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  740. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  741. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  742. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  743. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  744. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  745. \
  746. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  747. "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
  748. "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
  749. "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
  750. \
  751. "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
  752. "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
  753. "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
  754. "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
  755. \
  756. "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
  757. "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
  758. "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
  759. "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
  760. \
  761. "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
  762. "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
  763. "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
  764. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  765. MOVNTQ(%%mm0, (dst))\
  766. \
  767. "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
  768. "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
  769. "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
  770. "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
  771. MOVNTQ(%%mm6, 8(dst))\
  772. \
  773. "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
  774. "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
  775. "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
  776. MOVNTQ(%%mm5, 16(dst))\
  777. \
  778. "add $24, "#dst" \n\t"\
  779. \
  780. "add $8, "#index" \n\t"\
  781. "cmp "#dstw", "#index" \n\t"\
  782. " jb 1b \n\t"
  783. #define WRITEBGR24MMX2(dst, dstw, index) \
  784. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  785. "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
  786. "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
  787. "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
  788. "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
  789. "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
  790. \
  791. "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
  792. "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
  793. "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
  794. \
  795. "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
  796. "por %%mm1, %%mm6 \n\t"\
  797. "por %%mm3, %%mm6 \n\t"\
  798. MOVNTQ(%%mm6, (dst))\
  799. \
  800. "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
  801. "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
  802. "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
  803. "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
  804. \
  805. "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
  806. "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
  807. "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
  808. \
  809. "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
  810. "por %%mm3, %%mm6 \n\t"\
  811. MOVNTQ(%%mm6, 8(dst))\
  812. \
  813. "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
  814. "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
  815. "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
  816. \
  817. "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
  818. "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
  819. "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
  820. \
  821. "por %%mm1, %%mm3 \n\t"\
  822. "por %%mm3, %%mm6 \n\t"\
  823. MOVNTQ(%%mm6, 16(dst))\
  824. \
  825. "add $24, "#dst" \n\t"\
  826. \
  827. "add $8, "#index" \n\t"\
  828. "cmp "#dstw", "#index" \n\t"\
  829. " jb 1b \n\t"
  830. #if COMPILE_TEMPLATE_MMX2
  831. #undef WRITEBGR24
  832. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
  833. #else
  834. #undef WRITEBGR24
  835. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
  836. #endif
  837. #define REAL_WRITEYUY2(dst, dstw, index) \
  838. "packuswb %%mm3, %%mm3 \n\t"\
  839. "packuswb %%mm4, %%mm4 \n\t"\
  840. "packuswb %%mm7, %%mm1 \n\t"\
  841. "punpcklbw %%mm4, %%mm3 \n\t"\
  842. "movq %%mm1, %%mm7 \n\t"\
  843. "punpcklbw %%mm3, %%mm1 \n\t"\
  844. "punpckhbw %%mm3, %%mm7 \n\t"\
  845. \
  846. MOVNTQ(%%mm1, (dst, index, 2))\
  847. MOVNTQ(%%mm7, 8(dst, index, 2))\
  848. \
  849. "add $8, "#index" \n\t"\
  850. "cmp "#dstw", "#index" \n\t"\
  851. " jb 1b \n\t"
  852. #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
  853. static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
  854. const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
  855. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
  856. {
  857. if(!(c->flags & SWS_BITEXACT)) {
  858. if (c->flags & SWS_ACCURATE_RND) {
  859. if (uDest) {
  860. YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
  861. YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
  862. }
  863. if (CONFIG_SWSCALE_ALPHA && aDest) {
  864. YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
  865. }
  866. YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
  867. } else {
  868. if (uDest) {
  869. YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
  870. YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
  871. }
  872. if (CONFIG_SWSCALE_ALPHA && aDest) {
  873. YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
  874. }
  875. YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
  876. }
  877. return;
  878. }
  879. yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
  880. chrFilter, chrSrc, chrFilterSize,
  881. alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
  882. }
  883. static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
  884. const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
  885. uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
  886. {
  887. yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
  888. chrFilter, chrSrc, chrFilterSize,
  889. dest, uDest, dstW, chrDstW, dstFormat);
  890. }
  891. static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
  892. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
  893. {
  894. int i;
  895. if(!(c->flags & SWS_BITEXACT)) {
  896. long p= 4;
  897. const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
  898. uint8_t *dst[4]= {aDest, dest, uDest, vDest};
  899. x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
  900. if (c->flags & SWS_ACCURATE_RND) {
  901. while(p--) {
  902. if (dst[p]) {
  903. __asm__ volatile(
  904. YSCALEYUV2YV121_ACCURATE
  905. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  906. "g" (-counter[p])
  907. : "%"REG_a
  908. );
  909. }
  910. }
  911. } else {
  912. while(p--) {
  913. if (dst[p]) {
  914. __asm__ volatile(
  915. YSCALEYUV2YV121
  916. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  917. "g" (-counter[p])
  918. : "%"REG_a
  919. );
  920. }
  921. }
  922. }
  923. return;
  924. }
  925. for (i=0; i<dstW; i++) {
  926. int val= (lumSrc[i]+64)>>7;
  927. if (val&256) {
  928. if (val<0) val=0;
  929. else val=255;
  930. }
  931. dest[i]= val;
  932. }
  933. if (uDest)
  934. for (i=0; i<chrDstW; i++) {
  935. int u=(chrSrc[i ]+64)>>7;
  936. int v=(chrSrc[i + VOFW]+64)>>7;
  937. if ((u|v)&256) {
  938. if (u<0) u=0;
  939. else if (u>255) u=255;
  940. if (v<0) v=0;
  941. else if (v>255) v=255;
  942. }
  943. uDest[i]= u;
  944. vDest[i]= v;
  945. }
  946. if (CONFIG_SWSCALE_ALPHA && aDest)
  947. for (i=0; i<dstW; i++) {
  948. int val= (alpSrc[i]+64)>>7;
  949. aDest[i]= av_clip_uint8(val);
  950. }
  951. }
  952. /**
  953. * vertical scale YV12 to RGB
  954. */
  955. static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
  956. const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
  957. const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
  958. {
  959. x86_reg dummy=0;
  960. x86_reg dstW_reg = dstW;
  961. if(!(c->flags & SWS_BITEXACT)) {
  962. if (c->flags & SWS_ACCURATE_RND) {
  963. switch(c->dstFormat) {
  964. case PIX_FMT_RGB32:
  965. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  966. YSCALEYUV2PACKEDX_ACCURATE
  967. YSCALEYUV2RGBX
  968. "movq %%mm2, "U_TEMP"(%0) \n\t"
  969. "movq %%mm4, "V_TEMP"(%0) \n\t"
  970. "movq %%mm5, "Y_TEMP"(%0) \n\t"
  971. YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
  972. "movq "Y_TEMP"(%0), %%mm5 \n\t"
  973. "psraw $3, %%mm1 \n\t"
  974. "psraw $3, %%mm7 \n\t"
  975. "packuswb %%mm7, %%mm1 \n\t"
  976. WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
  977. YSCALEYUV2PACKEDX_END
  978. } else {
  979. YSCALEYUV2PACKEDX_ACCURATE
  980. YSCALEYUV2RGBX
  981. "pcmpeqd %%mm7, %%mm7 \n\t"
  982. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  983. YSCALEYUV2PACKEDX_END
  984. }
  985. return;
  986. case PIX_FMT_BGR24:
  987. YSCALEYUV2PACKEDX_ACCURATE
  988. YSCALEYUV2RGBX
  989. "pxor %%mm7, %%mm7 \n\t"
  990. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
  991. "add %4, %%"REG_c" \n\t"
  992. WRITEBGR24(%%REGc, %5, %%REGa)
  993. :: "r" (&c->redDither),
  994. "m" (dummy), "m" (dummy), "m" (dummy),
  995. "r" (dest), "m" (dstW_reg)
  996. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  997. );
  998. return;
  999. case PIX_FMT_RGB555:
  1000. YSCALEYUV2PACKEDX_ACCURATE
  1001. YSCALEYUV2RGBX
  1002. "pxor %%mm7, %%mm7 \n\t"
  1003. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1004. #ifdef DITHER1XBPP
  1005. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  1006. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  1007. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  1008. #endif
  1009. WRITERGB15(%4, %5, %%REGa)
  1010. YSCALEYUV2PACKEDX_END
  1011. return;
  1012. case PIX_FMT_RGB565:
  1013. YSCALEYUV2PACKEDX_ACCURATE
  1014. YSCALEYUV2RGBX
  1015. "pxor %%mm7, %%mm7 \n\t"
  1016. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1017. #ifdef DITHER1XBPP
  1018. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  1019. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  1020. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  1021. #endif
  1022. WRITERGB16(%4, %5, %%REGa)
  1023. YSCALEYUV2PACKEDX_END
  1024. return;
  1025. case PIX_FMT_YUYV422:
  1026. YSCALEYUV2PACKEDX_ACCURATE
  1027. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1028. "psraw $3, %%mm3 \n\t"
  1029. "psraw $3, %%mm4 \n\t"
  1030. "psraw $3, %%mm1 \n\t"
  1031. "psraw $3, %%mm7 \n\t"
  1032. WRITEYUY2(%4, %5, %%REGa)
  1033. YSCALEYUV2PACKEDX_END
  1034. return;
  1035. }
  1036. } else {
  1037. switch(c->dstFormat) {
  1038. case PIX_FMT_RGB32:
  1039. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1040. YSCALEYUV2PACKEDX
  1041. YSCALEYUV2RGBX
  1042. YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
  1043. "psraw $3, %%mm1 \n\t"
  1044. "psraw $3, %%mm7 \n\t"
  1045. "packuswb %%mm7, %%mm1 \n\t"
  1046. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1047. YSCALEYUV2PACKEDX_END
  1048. } else {
  1049. YSCALEYUV2PACKEDX
  1050. YSCALEYUV2RGBX
  1051. "pcmpeqd %%mm7, %%mm7 \n\t"
  1052. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1053. YSCALEYUV2PACKEDX_END
  1054. }
  1055. return;
  1056. case PIX_FMT_BGR24:
  1057. YSCALEYUV2PACKEDX
  1058. YSCALEYUV2RGBX
  1059. "pxor %%mm7, %%mm7 \n\t"
  1060. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
  1061. "add %4, %%"REG_c" \n\t"
  1062. WRITEBGR24(%%REGc, %5, %%REGa)
  1063. :: "r" (&c->redDither),
  1064. "m" (dummy), "m" (dummy), "m" (dummy),
  1065. "r" (dest), "m" (dstW_reg)
  1066. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  1067. );
  1068. return;
  1069. case PIX_FMT_RGB555:
  1070. YSCALEYUV2PACKEDX
  1071. YSCALEYUV2RGBX
  1072. "pxor %%mm7, %%mm7 \n\t"
  1073. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1074. #ifdef DITHER1XBPP
  1075. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  1076. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  1077. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  1078. #endif
  1079. WRITERGB15(%4, %5, %%REGa)
  1080. YSCALEYUV2PACKEDX_END
  1081. return;
  1082. case PIX_FMT_RGB565:
  1083. YSCALEYUV2PACKEDX
  1084. YSCALEYUV2RGBX
  1085. "pxor %%mm7, %%mm7 \n\t"
  1086. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1087. #ifdef DITHER1XBPP
  1088. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  1089. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  1090. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  1091. #endif
  1092. WRITERGB16(%4, %5, %%REGa)
  1093. YSCALEYUV2PACKEDX_END
  1094. return;
  1095. case PIX_FMT_YUYV422:
  1096. YSCALEYUV2PACKEDX
  1097. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1098. "psraw $3, %%mm3 \n\t"
  1099. "psraw $3, %%mm4 \n\t"
  1100. "psraw $3, %%mm1 \n\t"
  1101. "psraw $3, %%mm7 \n\t"
  1102. WRITEYUY2(%4, %5, %%REGa)
  1103. YSCALEYUV2PACKEDX_END
  1104. return;
  1105. }
  1106. }
  1107. }
  1108. yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
  1109. chrFilter, chrSrc, chrFilterSize,
  1110. alpSrc, dest, dstW, dstY);
  1111. }
  1112. /**
  1113. * vertical bilinear scale YV12 to RGB
  1114. */
  1115. static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
  1116. const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
  1117. {
  1118. int yalpha1=4095- yalpha;
  1119. int uvalpha1=4095-uvalpha;
  1120. int i;
  1121. if(!(c->flags & SWS_BITEXACT)) {
  1122. switch(c->dstFormat) {
  1123. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  1124. case PIX_FMT_RGB32:
  1125. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1126. #if ARCH_X86_64
  1127. __asm__ volatile(
  1128. YSCALEYUV2RGB(%%r8, %5)
  1129. YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
  1130. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1131. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1132. "packuswb %%mm7, %%mm1 \n\t"
  1133. WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1134. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
  1135. "a" (&c->redDither)
  1136. ,"r" (abuf0), "r" (abuf1)
  1137. : "%r8"
  1138. );
  1139. #else
  1140. *(const uint16_t **)(&c->u_temp)=abuf0;
  1141. *(const uint16_t **)(&c->v_temp)=abuf1;
  1142. __asm__ volatile(
  1143. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1144. "mov %4, %%"REG_b" \n\t"
  1145. "push %%"REG_BP" \n\t"
  1146. YSCALEYUV2RGB(%%REGBP, %5)
  1147. "push %0 \n\t"
  1148. "push %1 \n\t"
  1149. "mov "U_TEMP"(%5), %0 \n\t"
  1150. "mov "V_TEMP"(%5), %1 \n\t"
  1151. YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
  1152. "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1153. "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
  1154. "packuswb %%mm7, %%mm1 \n\t"
  1155. "pop %1 \n\t"
  1156. "pop %0 \n\t"
  1157. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1158. "pop %%"REG_BP" \n\t"
  1159. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1160. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1161. "a" (&c->redDither)
  1162. );
  1163. #endif
  1164. } else {
  1165. __asm__ volatile(
  1166. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1167. "mov %4, %%"REG_b" \n\t"
  1168. "push %%"REG_BP" \n\t"
  1169. YSCALEYUV2RGB(%%REGBP, %5)
  1170. "pcmpeqd %%mm7, %%mm7 \n\t"
  1171. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1172. "pop %%"REG_BP" \n\t"
  1173. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1174. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1175. "a" (&c->redDither)
  1176. );
  1177. }
  1178. return;
  1179. case PIX_FMT_BGR24:
  1180. __asm__ volatile(
  1181. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1182. "mov %4, %%"REG_b" \n\t"
  1183. "push %%"REG_BP" \n\t"
  1184. YSCALEYUV2RGB(%%REGBP, %5)
  1185. "pxor %%mm7, %%mm7 \n\t"
  1186. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1187. "pop %%"REG_BP" \n\t"
  1188. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1189. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1190. "a" (&c->redDither)
  1191. );
  1192. return;
  1193. case PIX_FMT_RGB555:
  1194. __asm__ volatile(
  1195. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1196. "mov %4, %%"REG_b" \n\t"
  1197. "push %%"REG_BP" \n\t"
  1198. YSCALEYUV2RGB(%%REGBP, %5)
  1199. "pxor %%mm7, %%mm7 \n\t"
  1200. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1201. #ifdef DITHER1XBPP
  1202. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1203. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1204. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1205. #endif
  1206. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1207. "pop %%"REG_BP" \n\t"
  1208. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1209. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1210. "a" (&c->redDither)
  1211. );
  1212. return;
  1213. case PIX_FMT_RGB565:
  1214. __asm__ volatile(
  1215. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1216. "mov %4, %%"REG_b" \n\t"
  1217. "push %%"REG_BP" \n\t"
  1218. YSCALEYUV2RGB(%%REGBP, %5)
  1219. "pxor %%mm7, %%mm7 \n\t"
  1220. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1221. #ifdef DITHER1XBPP
  1222. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1223. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1224. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1225. #endif
  1226. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1227. "pop %%"REG_BP" \n\t"
  1228. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1229. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1230. "a" (&c->redDither)
  1231. );
  1232. return;
  1233. case PIX_FMT_YUYV422:
  1234. __asm__ volatile(
  1235. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1236. "mov %4, %%"REG_b" \n\t"
  1237. "push %%"REG_BP" \n\t"
  1238. YSCALEYUV2PACKED(%%REGBP, %5)
  1239. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1240. "pop %%"REG_BP" \n\t"
  1241. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1242. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1243. "a" (&c->redDither)
  1244. );
  1245. return;
  1246. default: break;
  1247. }
  1248. }
  1249. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
  1250. }
  1251. /**
  1252. * YV12 to RGB without scaling or interpolating
  1253. */
  1254. static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
  1255. const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
  1256. {
  1257. const int yalpha1=0;
  1258. int i;
  1259. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1260. const int yalpha= 4096; //FIXME ...
  1261. if (flags&SWS_FULL_CHR_H_INT) {
  1262. c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
  1263. return;
  1264. }
  1265. if(!(flags & SWS_BITEXACT)) {
  1266. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1267. switch(dstFormat) {
  1268. case PIX_FMT_RGB32:
  1269. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1270. __asm__ volatile(
  1271. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1272. "mov %4, %%"REG_b" \n\t"
  1273. "push %%"REG_BP" \n\t"
  1274. YSCALEYUV2RGB1(%%REGBP, %5)
  1275. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1276. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1277. "pop %%"REG_BP" \n\t"
  1278. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1279. :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1280. "a" (&c->redDither)
  1281. );
  1282. } else {
  1283. __asm__ volatile(
  1284. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1285. "mov %4, %%"REG_b" \n\t"
  1286. "push %%"REG_BP" \n\t"
  1287. YSCALEYUV2RGB1(%%REGBP, %5)
  1288. "pcmpeqd %%mm7, %%mm7 \n\t"
  1289. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1290. "pop %%"REG_BP" \n\t"
  1291. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1292. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1293. "a" (&c->redDither)
  1294. );
  1295. }
  1296. return;
  1297. case PIX_FMT_BGR24:
  1298. __asm__ volatile(
  1299. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1300. "mov %4, %%"REG_b" \n\t"
  1301. "push %%"REG_BP" \n\t"
  1302. YSCALEYUV2RGB1(%%REGBP, %5)
  1303. "pxor %%mm7, %%mm7 \n\t"
  1304. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1305. "pop %%"REG_BP" \n\t"
  1306. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1307. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1308. "a" (&c->redDither)
  1309. );
  1310. return;
  1311. case PIX_FMT_RGB555:
  1312. __asm__ volatile(
  1313. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1314. "mov %4, %%"REG_b" \n\t"
  1315. "push %%"REG_BP" \n\t"
  1316. YSCALEYUV2RGB1(%%REGBP, %5)
  1317. "pxor %%mm7, %%mm7 \n\t"
  1318. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1319. #ifdef DITHER1XBPP
  1320. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1321. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1322. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1323. #endif
  1324. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1325. "pop %%"REG_BP" \n\t"
  1326. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1327. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1328. "a" (&c->redDither)
  1329. );
  1330. return;
  1331. case PIX_FMT_RGB565:
  1332. __asm__ volatile(
  1333. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1334. "mov %4, %%"REG_b" \n\t"
  1335. "push %%"REG_BP" \n\t"
  1336. YSCALEYUV2RGB1(%%REGBP, %5)
  1337. "pxor %%mm7, %%mm7 \n\t"
  1338. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1339. #ifdef DITHER1XBPP
  1340. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1341. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1342. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1343. #endif
  1344. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1345. "pop %%"REG_BP" \n\t"
  1346. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1347. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1348. "a" (&c->redDither)
  1349. );
  1350. return;
  1351. case PIX_FMT_YUYV422:
  1352. __asm__ volatile(
  1353. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1354. "mov %4, %%"REG_b" \n\t"
  1355. "push %%"REG_BP" \n\t"
  1356. YSCALEYUV2PACKED1(%%REGBP, %5)
  1357. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1358. "pop %%"REG_BP" \n\t"
  1359. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1360. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1361. "a" (&c->redDither)
  1362. );
  1363. return;
  1364. }
  1365. } else {
  1366. switch(dstFormat) {
  1367. case PIX_FMT_RGB32:
  1368. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1369. __asm__ volatile(
  1370. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1371. "mov %4, %%"REG_b" \n\t"
  1372. "push %%"REG_BP" \n\t"
  1373. YSCALEYUV2RGB1b(%%REGBP, %5)
  1374. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1375. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1376. "pop %%"REG_BP" \n\t"
  1377. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1378. :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1379. "a" (&c->redDither)
  1380. );
  1381. } else {
  1382. __asm__ volatile(
  1383. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1384. "mov %4, %%"REG_b" \n\t"
  1385. "push %%"REG_BP" \n\t"
  1386. YSCALEYUV2RGB1b(%%REGBP, %5)
  1387. "pcmpeqd %%mm7, %%mm7 \n\t"
  1388. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1389. "pop %%"REG_BP" \n\t"
  1390. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1391. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1392. "a" (&c->redDither)
  1393. );
  1394. }
  1395. return;
  1396. case PIX_FMT_BGR24:
  1397. __asm__ volatile(
  1398. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1399. "mov %4, %%"REG_b" \n\t"
  1400. "push %%"REG_BP" \n\t"
  1401. YSCALEYUV2RGB1b(%%REGBP, %5)
  1402. "pxor %%mm7, %%mm7 \n\t"
  1403. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1404. "pop %%"REG_BP" \n\t"
  1405. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1406. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1407. "a" (&c->redDither)
  1408. );
  1409. return;
  1410. case PIX_FMT_RGB555:
  1411. __asm__ volatile(
  1412. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1413. "mov %4, %%"REG_b" \n\t"
  1414. "push %%"REG_BP" \n\t"
  1415. YSCALEYUV2RGB1b(%%REGBP, %5)
  1416. "pxor %%mm7, %%mm7 \n\t"
1417. /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1418. #ifdef DITHER1XBPP
  1419. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1420. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1421. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1422. #endif
  1423. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1424. "pop %%"REG_BP" \n\t"
  1425. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1426. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1427. "a" (&c->redDither)
  1428. );
  1429. return;
  1430. case PIX_FMT_RGB565:
  1431. __asm__ volatile(
  1432. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1433. "mov %4, %%"REG_b" \n\t"
  1434. "push %%"REG_BP" \n\t"
  1435. YSCALEYUV2RGB1b(%%REGBP, %5)
  1436. "pxor %%mm7, %%mm7 \n\t"
1437. /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1438. #ifdef DITHER1XBPP
  1439. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1440. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1441. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1442. #endif
  1443. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1444. "pop %%"REG_BP" \n\t"
  1445. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1446. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1447. "a" (&c->redDither)
  1448. );
  1449. return;
  1450. case PIX_FMT_YUYV422:
  1451. __asm__ volatile(
  1452. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1453. "mov %4, %%"REG_b" \n\t"
  1454. "push %%"REG_BP" \n\t"
  1455. YSCALEYUV2PACKED1b(%%REGBP, %5)
  1456. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1457. "pop %%"REG_BP" \n\t"
  1458. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1459. :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
  1460. "a" (&c->redDither)
  1461. );
  1462. return;
  1463. }
  1464. }
  1465. }
  1466. if (uvalpha < 2048) {
  1467. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
  1468. } else {
  1469. YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
  1470. }
  1471. }
  1472. //FIXME yuy2* can read up to 7 samples too much
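/* Extract luma from packed YUYV: the bm01010101 byte mask keeps the even (Y)
* bytes of each sample pair, and packuswb repacks eight luma samples per
* iteration into the destination. */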
  1473. static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1474. {
  1475. __asm__ volatile(
  1476. "movq "MANGLE(bm01010101)", %%mm2 \n\t"
  1477. "mov %0, %%"REG_a" \n\t"
  1478. "1: \n\t"
  1479. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1480. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1481. "pand %%mm2, %%mm0 \n\t"
  1482. "pand %%mm2, %%mm1 \n\t"
  1483. "packuswb %%mm1, %%mm0 \n\t"
  1484. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1485. "add $8, %%"REG_a" \n\t"
  1486. " js 1b \n\t"
  1487. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1488. : "%"REG_a
  1489. );
  1490. }
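/* Extract and deinterleave chroma from packed YUYV: psrlw $8 keeps the odd
* (U/V) bytes, which are then split into four U and four V samples per
* iteration. The assert below records that packed input passes the same
* pointer for both chroma "planes". */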
  1491. static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1492. {
  1493. __asm__ volatile(
  1494. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1495. "mov %0, %%"REG_a" \n\t"
  1496. "1: \n\t"
  1497. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1498. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1499. "psrlw $8, %%mm0 \n\t"
  1500. "psrlw $8, %%mm1 \n\t"
  1501. "packuswb %%mm1, %%mm0 \n\t"
  1502. "movq %%mm0, %%mm1 \n\t"
  1503. "psrlw $8, %%mm0 \n\t"
  1504. "pand %%mm4, %%mm1 \n\t"
  1505. "packuswb %%mm0, %%mm0 \n\t"
  1506. "packuswb %%mm1, %%mm1 \n\t"
  1507. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1508. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1509. "add $4, %%"REG_a" \n\t"
  1510. " js 1b \n\t"
  1511. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1512. : "%"REG_a
  1513. );
  1514. assert(src1 == src2);
  1515. }
  1516. static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1517. {
  1518. __asm__ volatile(
  1519. "mov %0, %%"REG_a" \n\t"
  1520. "1: \n\t"
  1521. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1522. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1523. "movq (%2, %%"REG_a",2), %%mm2 \n\t"
  1524. "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
  1525. "psrlw $8, %%mm0 \n\t"
  1526. "psrlw $8, %%mm1 \n\t"
  1527. "psrlw $8, %%mm2 \n\t"
  1528. "psrlw $8, %%mm3 \n\t"
  1529. "packuswb %%mm1, %%mm0 \n\t"
  1530. "packuswb %%mm3, %%mm2 \n\t"
  1531. "movq %%mm0, (%3, %%"REG_a") \n\t"
  1532. "movq %%mm2, (%4, %%"REG_a") \n\t"
  1533. "add $8, %%"REG_a" \n\t"
  1534. " js 1b \n\t"
  1535. : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
  1536. : "%"REG_a
  1537. );
  1538. }
1539. /* This is almost identical to the previous, and exists only because
1540. * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
  1541. static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1542. {
  1543. __asm__ volatile(
  1544. "mov %0, %%"REG_a" \n\t"
  1545. "1: \n\t"
  1546. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1547. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1548. "psrlw $8, %%mm0 \n\t"
  1549. "psrlw $8, %%mm1 \n\t"
  1550. "packuswb %%mm1, %%mm0 \n\t"
  1551. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1552. "add $8, %%"REG_a" \n\t"
  1553. " js 1b \n\t"
  1554. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1555. : "%"REG_a
  1556. );
  1557. }
  1558. static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1559. {
  1560. __asm__ volatile(
  1561. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1562. "mov %0, %%"REG_a" \n\t"
  1563. "1: \n\t"
  1564. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1565. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1566. "pand %%mm4, %%mm0 \n\t"
  1567. "pand %%mm4, %%mm1 \n\t"
  1568. "packuswb %%mm1, %%mm0 \n\t"
  1569. "movq %%mm0, %%mm1 \n\t"
  1570. "psrlw $8, %%mm0 \n\t"
  1571. "pand %%mm4, %%mm1 \n\t"
  1572. "packuswb %%mm0, %%mm0 \n\t"
  1573. "packuswb %%mm1, %%mm1 \n\t"
  1574. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1575. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1576. "add $4, %%"REG_a" \n\t"
  1577. " js 1b \n\t"
  1578. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1579. : "%"REG_a
  1580. );
  1581. assert(src1 == src2);
  1582. }
  1583. static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1584. {
  1585. __asm__ volatile(
  1586. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1587. "mov %0, %%"REG_a" \n\t"
  1588. "1: \n\t"
  1589. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1590. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1591. "movq (%2, %%"REG_a",2), %%mm2 \n\t"
  1592. "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
  1593. "pand %%mm4, %%mm0 \n\t"
  1594. "pand %%mm4, %%mm1 \n\t"
  1595. "pand %%mm4, %%mm2 \n\t"
  1596. "pand %%mm4, %%mm3 \n\t"
  1597. "packuswb %%mm1, %%mm0 \n\t"
  1598. "packuswb %%mm3, %%mm2 \n\t"
  1599. "movq %%mm0, (%3, %%"REG_a") \n\t"
  1600. "movq %%mm2, (%4, %%"REG_a") \n\t"
  1601. "add $8, %%"REG_a" \n\t"
  1602. " js 1b \n\t"
  1603. : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
  1604. : "%"REG_a
  1605. );
  1606. }
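/* Split the interleaved NV12/NV21 chroma plane: the byte mask keeps the first
* component of each pair, psrlw $8 the second, and the two results go to
* separate planes. nv12ToUV/nv21ToUV below only differ in which destination
* receives which component. */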
  1607. static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
  1608. const uint8_t *src, long width)
  1609. {
  1610. __asm__ volatile(
  1611. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1612. "mov %0, %%"REG_a" \n\t"
  1613. "1: \n\t"
  1614. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1615. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1616. "movq %%mm0, %%mm2 \n\t"
  1617. "movq %%mm1, %%mm3 \n\t"
  1618. "pand %%mm4, %%mm0 \n\t"
  1619. "pand %%mm4, %%mm1 \n\t"
  1620. "psrlw $8, %%mm2 \n\t"
  1621. "psrlw $8, %%mm3 \n\t"
  1622. "packuswb %%mm1, %%mm0 \n\t"
  1623. "packuswb %%mm3, %%mm2 \n\t"
  1624. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1625. "movq %%mm2, (%3, %%"REG_a") \n\t"
  1626. "add $8, %%"REG_a" \n\t"
  1627. " js 1b \n\t"
  1628. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
  1629. : "%"REG_a
  1630. );
  1631. }
  1632. static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
  1633. const uint8_t *src1, const uint8_t *src2,
  1634. long width, uint32_t *unused)
  1635. {
  1636. RENAME(nvXXtoUV)(dstU, dstV, src1, width);
  1637. }
  1638. static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
  1639. const uint8_t *src1, const uint8_t *src2,
  1640. long width, uint32_t *unused)
  1641. {
  1642. RENAME(nvXXtoUV)(dstV, dstU, src1, width);
  1643. }
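/* RGB24/BGR24 -> luma. The first asm block selects BGR vs. RGB coefficient
* order (ff_bgr24toY*Coeff / ff_rgb24toY*Coeff); the main loop then computes,
* roughly, Y = (ry*R + gy*G + by*B + offset) >> 15 for four pixels per
* iteration, using pmaddwd on the unpacked 16-bit components, with
* ff_bgr24toYOffset supplying the combined bias/rounding term. */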
  1644. static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
  1645. {
  1646. if(srcFormat == PIX_FMT_BGR24) {
  1647. __asm__ volatile(
  1648. "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
  1649. "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
  1650. :
  1651. );
  1652. } else {
  1653. __asm__ volatile(
  1654. "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
  1655. "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
  1656. :
  1657. );
  1658. }
  1659. __asm__ volatile(
  1660. "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
  1661. "mov %2, %%"REG_a" \n\t"
  1662. "pxor %%mm7, %%mm7 \n\t"
  1663. "1: \n\t"
  1664. PREFETCH" 64(%0) \n\t"
  1665. "movd (%0), %%mm0 \n\t"
  1666. "movd 2(%0), %%mm1 \n\t"
  1667. "movd 6(%0), %%mm2 \n\t"
  1668. "movd 8(%0), %%mm3 \n\t"
  1669. "add $12, %0 \n\t"
  1670. "punpcklbw %%mm7, %%mm0 \n\t"
  1671. "punpcklbw %%mm7, %%mm1 \n\t"
  1672. "punpcklbw %%mm7, %%mm2 \n\t"
  1673. "punpcklbw %%mm7, %%mm3 \n\t"
  1674. "pmaddwd %%mm5, %%mm0 \n\t"
  1675. "pmaddwd %%mm6, %%mm1 \n\t"
  1676. "pmaddwd %%mm5, %%mm2 \n\t"
  1677. "pmaddwd %%mm6, %%mm3 \n\t"
  1678. "paddd %%mm1, %%mm0 \n\t"
  1679. "paddd %%mm3, %%mm2 \n\t"
  1680. "paddd %%mm4, %%mm0 \n\t"
  1681. "paddd %%mm4, %%mm2 \n\t"
  1682. "psrad $15, %%mm0 \n\t"
  1683. "psrad $15, %%mm2 \n\t"
  1684. "packssdw %%mm2, %%mm0 \n\t"
  1685. "packuswb %%mm0, %%mm0 \n\t"
  1686. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1687. "add $4, %%"REG_a" \n\t"
  1688. " js 1b \n\t"
  1689. : "+r" (src)
  1690. : "r" (dst+width), "g" ((x86_reg)-width)
  1691. : "%"REG_a
  1692. );
  1693. }
  1694. static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
  1695. {
  1696. __asm__ volatile(
  1697. "movq 24(%4), %%mm6 \n\t"
  1698. "mov %3, %%"REG_a" \n\t"
  1699. "pxor %%mm7, %%mm7 \n\t"
  1700. "1: \n\t"
  1701. PREFETCH" 64(%0) \n\t"
  1702. "movd (%0), %%mm0 \n\t"
  1703. "movd 2(%0), %%mm1 \n\t"
  1704. "punpcklbw %%mm7, %%mm0 \n\t"
  1705. "punpcklbw %%mm7, %%mm1 \n\t"
  1706. "movq %%mm0, %%mm2 \n\t"
  1707. "movq %%mm1, %%mm3 \n\t"
  1708. "pmaddwd (%4), %%mm0 \n\t"
  1709. "pmaddwd 8(%4), %%mm1 \n\t"
  1710. "pmaddwd 16(%4), %%mm2 \n\t"
  1711. "pmaddwd %%mm6, %%mm3 \n\t"
  1712. "paddd %%mm1, %%mm0 \n\t"
  1713. "paddd %%mm3, %%mm2 \n\t"
  1714. "movd 6(%0), %%mm1 \n\t"
  1715. "movd 8(%0), %%mm3 \n\t"
  1716. "add $12, %0 \n\t"
  1717. "punpcklbw %%mm7, %%mm1 \n\t"
  1718. "punpcklbw %%mm7, %%mm3 \n\t"
  1719. "movq %%mm1, %%mm4 \n\t"
  1720. "movq %%mm3, %%mm5 \n\t"
  1721. "pmaddwd (%4), %%mm1 \n\t"
  1722. "pmaddwd 8(%4), %%mm3 \n\t"
  1723. "pmaddwd 16(%4), %%mm4 \n\t"
  1724. "pmaddwd %%mm6, %%mm5 \n\t"
  1725. "paddd %%mm3, %%mm1 \n\t"
  1726. "paddd %%mm5, %%mm4 \n\t"
  1727. "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
  1728. "paddd %%mm3, %%mm0 \n\t"
  1729. "paddd %%mm3, %%mm2 \n\t"
  1730. "paddd %%mm3, %%mm1 \n\t"
  1731. "paddd %%mm3, %%mm4 \n\t"
  1732. "psrad $15, %%mm0 \n\t"
  1733. "psrad $15, %%mm2 \n\t"
  1734. "psrad $15, %%mm1 \n\t"
  1735. "psrad $15, %%mm4 \n\t"
  1736. "packssdw %%mm1, %%mm0 \n\t"
  1737. "packssdw %%mm4, %%mm2 \n\t"
  1738. "packuswb %%mm0, %%mm0 \n\t"
  1739. "packuswb %%mm2, %%mm2 \n\t"
  1740. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1741. "movd %%mm2, (%2, %%"REG_a") \n\t"
  1742. "add $4, %%"REG_a" \n\t"
  1743. " js 1b \n\t"
  1744. : "+r" (src)
  1745. : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
  1746. : "%"REG_a
  1747. );
  1748. }
  1749. static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1750. {
  1751. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
  1752. }
  1753. static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1754. {
  1755. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
  1756. assert(src1 == src2);
  1757. }
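/* The *ToUV_half variants feed horizontally subsampled chroma: each output
* sample is computed from the sum of two adjacent source pixels, so the shift
* is one larger and (257<<RGB2YUV_SHIFT) folds the doubled 128 chroma offset
* plus rounding into a single constant. */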
  1758. static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1759. {
  1760. int i;
  1761. for (i=0; i<width; i++) {
  1762. int b= src1[6*i + 0] + src1[6*i + 3];
  1763. int g= src1[6*i + 1] + src1[6*i + 4];
  1764. int r= src1[6*i + 2] + src1[6*i + 5];
  1765. dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1766. dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1767. }
  1768. assert(src1 == src2);
  1769. }
  1770. static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
  1771. {
  1772. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
  1773. }
  1774. static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1775. {
  1776. assert(src1==src2);
  1777. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
  1778. }
  1779. static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
  1780. {
  1781. int i;
  1782. assert(src1==src2);
  1783. for (i=0; i<width; i++) {
  1784. int r= src1[6*i + 0] + src1[6*i + 3];
  1785. int g= src1[6*i + 1] + src1[6*i + 4];
  1786. int b= src1[6*i + 2] + src1[6*i + 5];
  1787. dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1788. dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
  1789. }
  1790. }
  1791. // bilinear / bicubic scaling
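/* Horizontal FIR scaler. Separate MMX paths exist for filterSize 4, 8 and the
* generic case; all of them are, roughly, the scalar loop
*     for (i = 0; i < dstW; i++) {
*         int j, val = 0;
*         for (j = 0; j < filterSize; j++)
*             val += src[filterPos[i] + j] * filter[filterSize*i + j];
*         dst[i] = val >> 7; // clipped via packssdw signed saturation
*     }
* yielding samples in the internal 15-bit (value<<7) format. */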
  1792. static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
  1793. const int16_t *filter, const int16_t *filterPos, long filterSize)
  1794. {
  1795. assert(filterSize % 4 == 0 && filterSize>0);
  1796. if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
  1797. x86_reg counter= -2*dstW;
  1798. filter-= counter*2;
  1799. filterPos-= counter/2;
  1800. dst-= counter/2;
  1801. __asm__ volatile(
  1802. #if defined(PIC)
  1803. "push %%"REG_b" \n\t"
  1804. #endif
  1805. "pxor %%mm7, %%mm7 \n\t"
  1806. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  1807. "mov %%"REG_a", %%"REG_BP" \n\t"
  1808. ".p2align 4 \n\t"
  1809. "1: \n\t"
  1810. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  1811. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  1812. "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
  1813. "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
  1814. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  1815. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  1816. "punpcklbw %%mm7, %%mm0 \n\t"
  1817. "punpcklbw %%mm7, %%mm2 \n\t"
  1818. "pmaddwd %%mm1, %%mm0 \n\t"
  1819. "pmaddwd %%mm2, %%mm3 \n\t"
  1820. "movq %%mm0, %%mm4 \n\t"
  1821. "punpckldq %%mm3, %%mm0 \n\t"
  1822. "punpckhdq %%mm3, %%mm4 \n\t"
  1823. "paddd %%mm4, %%mm0 \n\t"
  1824. "psrad $7, %%mm0 \n\t"
  1825. "packssdw %%mm0, %%mm0 \n\t"
  1826. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  1827. "add $4, %%"REG_BP" \n\t"
  1828. " jnc 1b \n\t"
  1829. "pop %%"REG_BP" \n\t"
  1830. #if defined(PIC)
  1831. "pop %%"REG_b" \n\t"
  1832. #endif
  1833. : "+a" (counter)
  1834. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  1835. #if !defined(PIC)
  1836. : "%"REG_b
  1837. #endif
  1838. );
  1839. } else if (filterSize==8) {
  1840. x86_reg counter= -2*dstW;
  1841. filter-= counter*4;
  1842. filterPos-= counter/2;
  1843. dst-= counter/2;
  1844. __asm__ volatile(
  1845. #if defined(PIC)
  1846. "push %%"REG_b" \n\t"
  1847. #endif
  1848. "pxor %%mm7, %%mm7 \n\t"
  1849. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  1850. "mov %%"REG_a", %%"REG_BP" \n\t"
  1851. ".p2align 4 \n\t"
  1852. "1: \n\t"
  1853. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  1854. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  1855. "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
  1856. "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
  1857. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  1858. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  1859. "punpcklbw %%mm7, %%mm0 \n\t"
  1860. "punpcklbw %%mm7, %%mm2 \n\t"
  1861. "pmaddwd %%mm1, %%mm0 \n\t"
  1862. "pmaddwd %%mm2, %%mm3 \n\t"
  1863. "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
  1864. "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
  1865. "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
  1866. "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
  1867. "punpcklbw %%mm7, %%mm4 \n\t"
  1868. "punpcklbw %%mm7, %%mm2 \n\t"
  1869. "pmaddwd %%mm1, %%mm4 \n\t"
  1870. "pmaddwd %%mm2, %%mm5 \n\t"
  1871. "paddd %%mm4, %%mm0 \n\t"
  1872. "paddd %%mm5, %%mm3 \n\t"
  1873. "movq %%mm0, %%mm4 \n\t"
  1874. "punpckldq %%mm3, %%mm0 \n\t"
  1875. "punpckhdq %%mm3, %%mm4 \n\t"
  1876. "paddd %%mm4, %%mm0 \n\t"
  1877. "psrad $7, %%mm0 \n\t"
  1878. "packssdw %%mm0, %%mm0 \n\t"
  1879. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  1880. "add $4, %%"REG_BP" \n\t"
  1881. " jnc 1b \n\t"
  1882. "pop %%"REG_BP" \n\t"
  1883. #if defined(PIC)
  1884. "pop %%"REG_b" \n\t"
  1885. #endif
  1886. : "+a" (counter)
  1887. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  1888. #if !defined(PIC)
  1889. : "%"REG_b
  1890. #endif
  1891. );
  1892. } else {
  1893. const uint8_t *offset = src+filterSize;
  1894. x86_reg counter= -2*dstW;
  1895. //filter-= counter*filterSize/2;
  1896. filterPos-= counter/2;
  1897. dst-= counter/2;
  1898. __asm__ volatile(
  1899. "pxor %%mm7, %%mm7 \n\t"
  1900. ".p2align 4 \n\t"
  1901. "1: \n\t"
  1902. "mov %2, %%"REG_c" \n\t"
  1903. "movzwl (%%"REG_c", %0), %%eax \n\t"
  1904. "movzwl 2(%%"REG_c", %0), %%edx \n\t"
  1905. "mov %5, %%"REG_c" \n\t"
  1906. "pxor %%mm4, %%mm4 \n\t"
  1907. "pxor %%mm5, %%mm5 \n\t"
  1908. "2: \n\t"
  1909. "movq (%1), %%mm1 \n\t"
  1910. "movq (%1, %6), %%mm3 \n\t"
  1911. "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
  1912. "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
  1913. "punpcklbw %%mm7, %%mm0 \n\t"
  1914. "punpcklbw %%mm7, %%mm2 \n\t"
  1915. "pmaddwd %%mm1, %%mm0 \n\t"
  1916. "pmaddwd %%mm2, %%mm3 \n\t"
  1917. "paddd %%mm3, %%mm5 \n\t"
  1918. "paddd %%mm0, %%mm4 \n\t"
  1919. "add $8, %1 \n\t"
  1920. "add $4, %%"REG_c" \n\t"
  1921. "cmp %4, %%"REG_c" \n\t"
  1922. " jb 2b \n\t"
  1923. "add %6, %1 \n\t"
  1924. "movq %%mm4, %%mm0 \n\t"
  1925. "punpckldq %%mm5, %%mm4 \n\t"
  1926. "punpckhdq %%mm5, %%mm0 \n\t"
  1927. "paddd %%mm0, %%mm4 \n\t"
  1928. "psrad $7, %%mm4 \n\t"
  1929. "packssdw %%mm4, %%mm4 \n\t"
  1930. "mov %3, %%"REG_a" \n\t"
  1931. "movd %%mm4, (%%"REG_a", %0) \n\t"
  1932. "add $4, %0 \n\t"
  1933. " jnc 1b \n\t"
  1934. : "+r" (counter), "+r" (filter)
  1935. : "m" (filterPos), "m" (dst), "m"(offset),
  1936. "m" (src), "r" ((x86_reg)filterSize*2)
  1937. : "%"REG_a, "%"REG_c, "%"REG_d
  1938. );
  1939. }
  1940. }
1941. //FIXME all pal and rgb srcFormats could do this conversion as well
  1942. //FIXME all scalers more complex than bilinear could do half of this transform
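/* Range conversion on the internal 15-bit (value<<7) samples. The fixed-point
* constants below approximate the usual limited<->full range scale factors,
* 255/219 for luma and 255/224 for chroma (e.g. 19077/2^14 ~ 255/219,
* 4663/2^12 ~ 255/224), with the additive terms folding in the 16/128 offsets. */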
  1943. static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
  1944. {
  1945. int i;
  1946. for (i = 0; i < width; i++) {
  1947. dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
  1948. dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
  1949. }
  1950. }
  1951. static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
  1952. {
  1953. int i;
  1954. for (i = 0; i < width; i++) {
  1955. dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469
  1956. dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
  1957. }
  1958. }
  1959. static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
  1960. {
  1961. int i;
  1962. for (i = 0; i < width; i++)
  1963. dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
  1964. }
  1965. static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
  1966. {
  1967. int i;
  1968. for (i = 0; i < width; i++)
  1969. dst[i] = (dst[i]*14071 + 33561947)>>14;
  1970. }
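/* One step of the fast bilinear horizontal scaler:
* %%esi = (src[xx]*(0x10000 - xalpha) + src[xx+1]*xalpha) >> 9,
* i.e. linear interpolation between two neighbouring samples, producing the
* 15-bit (value<<7) intermediate format expected by the vertical scalers. */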
  1971. #define FAST_BILINEAR_X86 \
  1972. "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
  1973. "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
  1974. "shll $16, %%edi \n\t" \
  1975. "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
  1976. "mov %1, %%"REG_D"\n\t" \
  1977. "shrl $9, %%esi \n\t" \
  1978. static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
  1979. long dstWidth, const uint8_t *src, int srcW,
  1980. int xInc)
  1981. {
  1982. #if COMPILE_TEMPLATE_MMX2
  1983. int32_t *filterPos = c->hLumFilterPos;
  1984. int16_t *filter = c->hLumFilter;
  1985. int canMMX2BeUsed = c->canMMX2BeUsed;
  1986. void *mmx2FilterCode= c->lumMmx2FilterCode;
  1987. int i;
  1988. #if defined(PIC)
  1989. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  1990. #endif
  1991. if (canMMX2BeUsed) {
  1992. __asm__ volatile(
  1993. #if defined(PIC)
  1994. "mov %%"REG_b", %5 \n\t"
  1995. #endif
  1996. "pxor %%mm7, %%mm7 \n\t"
  1997. "mov %0, %%"REG_c" \n\t"
  1998. "mov %1, %%"REG_D" \n\t"
  1999. "mov %2, %%"REG_d" \n\t"
  2000. "mov %3, %%"REG_b" \n\t"
  2001. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2002. PREFETCH" (%%"REG_c") \n\t"
  2003. PREFETCH" 32(%%"REG_c") \n\t"
  2004. PREFETCH" 64(%%"REG_c") \n\t"
  2005. #if ARCH_X86_64
  2006. #define CALL_MMX2_FILTER_CODE \
  2007. "movl (%%"REG_b"), %%esi \n\t"\
  2008. "call *%4 \n\t"\
  2009. "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
  2010. "add %%"REG_S", %%"REG_c" \n\t"\
  2011. "add %%"REG_a", %%"REG_D" \n\t"\
  2012. "xor %%"REG_a", %%"REG_a" \n\t"\
  2013. #else
  2014. #define CALL_MMX2_FILTER_CODE \
  2015. "movl (%%"REG_b"), %%esi \n\t"\
  2016. "call *%4 \n\t"\
  2017. "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
  2018. "add %%"REG_a", %%"REG_D" \n\t"\
  2019. "xor %%"REG_a", %%"REG_a" \n\t"\
  2020. #endif /* ARCH_X86_64 */
  2021. CALL_MMX2_FILTER_CODE
  2022. CALL_MMX2_FILTER_CODE
  2023. CALL_MMX2_FILTER_CODE
  2024. CALL_MMX2_FILTER_CODE
  2025. CALL_MMX2_FILTER_CODE
  2026. CALL_MMX2_FILTER_CODE
  2027. CALL_MMX2_FILTER_CODE
  2028. CALL_MMX2_FILTER_CODE
  2029. #if defined(PIC)
  2030. "mov %5, %%"REG_b" \n\t"
  2031. #endif
  2032. :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
  2033. "m" (mmx2FilterCode)
  2034. #if defined(PIC)
  2035. ,"m" (ebxsave)
  2036. #endif
  2037. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2038. #if !defined(PIC)
  2039. ,"%"REG_b
  2040. #endif
  2041. );
  2042. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
  2043. } else {
  2044. #endif /* COMPILE_TEMPLATE_MMX2 */
  2045. x86_reg xInc_shr16 = xInc >> 16;
  2046. uint16_t xInc_mask = xInc & 0xffff;
  2047. x86_reg dstWidth_reg = dstWidth;
  2048. //NO MMX just normal asm ...
  2049. __asm__ volatile(
  2050. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2051. "xor %%"REG_d", %%"REG_d" \n\t" // xx
  2052. "xorl %%ecx, %%ecx \n\t" // xalpha
  2053. ".p2align 4 \n\t"
  2054. "1: \n\t"
  2055. "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
  2056. "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2057. FAST_BILINEAR_X86
  2058. "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
  2059. "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
  2060. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
  2061. "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
  2062. "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2063. FAST_BILINEAR_X86
  2064. "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
  2065. "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
  2066. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
  2067. "add $2, %%"REG_a" \n\t"
  2068. "cmp %2, %%"REG_a" \n\t"
  2069. " jb 1b \n\t"
  2070. :: "r" (src), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask)
  2071. : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
  2072. );
  2073. #if COMPILE_TEMPLATE_MMX2
  2074. } //if MMX2 can't be used
  2075. #endif
  2076. }
  2077. // *** horizontal scale Y line to temp buffer
  2078. static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
  2079. const int16_t *hLumFilter,
  2080. const int16_t *hLumFilterPos, int hLumFilterSize,
  2081. uint8_t *formatConvBuffer,
  2082. uint32_t *pal, int isAlpha)
  2083. {
  2084. void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
  2085. void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
  2086. src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
  2087. if (toYV12) {
  2088. toYV12(formatConvBuffer, src, srcW, pal);
  2089. src= formatConvBuffer;
  2090. }
  2091. if (!c->hyscale_fast) {
  2092. c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
  2093. } else { // fast bilinear upscale / crap downscale
  2094. c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
  2095. }
  2096. if (convertRange)
  2097. convertRange(dst, dstWidth);
  2098. }
  2099. static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
  2100. long dstWidth, const uint8_t *src1,
  2101. const uint8_t *src2, int srcW, int xInc)
  2102. {
  2103. #if COMPILE_TEMPLATE_MMX2
  2104. int32_t *filterPos = c->hChrFilterPos;
  2105. int16_t *filter = c->hChrFilter;
  2106. int canMMX2BeUsed = c->canMMX2BeUsed;
  2107. void *mmx2FilterCode= c->chrMmx2FilterCode;
  2108. int i;
  2109. #if defined(PIC)
  2110. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  2111. #endif
  2112. if (canMMX2BeUsed) {
  2113. __asm__ volatile(
  2114. #if defined(PIC)
  2115. "mov %%"REG_b", %6 \n\t"
  2116. #endif
  2117. "pxor %%mm7, %%mm7 \n\t"
  2118. "mov %0, %%"REG_c" \n\t"
  2119. "mov %1, %%"REG_D" \n\t"
  2120. "mov %2, %%"REG_d" \n\t"
  2121. "mov %3, %%"REG_b" \n\t"
  2122. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2123. PREFETCH" (%%"REG_c") \n\t"
  2124. PREFETCH" 32(%%"REG_c") \n\t"
  2125. PREFETCH" 64(%%"REG_c") \n\t"
  2126. CALL_MMX2_FILTER_CODE
  2127. CALL_MMX2_FILTER_CODE
  2128. CALL_MMX2_FILTER_CODE
  2129. CALL_MMX2_FILTER_CODE
  2130. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2131. "mov %5, %%"REG_c" \n\t" // src
  2132. "mov %1, %%"REG_D" \n\t" // buf1
  2133. "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
  2134. PREFETCH" (%%"REG_c") \n\t"
  2135. PREFETCH" 32(%%"REG_c") \n\t"
  2136. PREFETCH" 64(%%"REG_c") \n\t"
  2137. CALL_MMX2_FILTER_CODE
  2138. CALL_MMX2_FILTER_CODE
  2139. CALL_MMX2_FILTER_CODE
  2140. CALL_MMX2_FILTER_CODE
  2141. #if defined(PIC)
  2142. "mov %6, %%"REG_b" \n\t"
  2143. #endif
  2144. :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
  2145. "m" (mmx2FilterCode), "m" (src2)
  2146. #if defined(PIC)
  2147. ,"m" (ebxsave)
  2148. #endif
  2149. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2150. #if !defined(PIC)
  2151. ,"%"REG_b
  2152. #endif
  2153. );
  2154. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
  2155. //printf("%d %d %d\n", dstWidth, i, srcW);
  2156. dst[i] = src1[srcW-1]*128;
  2157. dst[i+VOFW] = src2[srcW-1]*128;
  2158. }
  2159. } else {
  2160. #endif /* COMPILE_TEMPLATE_MMX2 */
  2161. x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
  2162. uint16_t xInc_mask = xInc & 0xffff;
  2163. x86_reg dstWidth_reg = dstWidth;
  2164. __asm__ volatile(
  2165. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2166. "xor %%"REG_d", %%"REG_d" \n\t" // xx
  2167. "xorl %%ecx, %%ecx \n\t" // xalpha
  2168. ".p2align 4 \n\t"
  2169. "1: \n\t"
  2170. "mov %0, %%"REG_S" \n\t"
  2171. "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
  2172. "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
  2173. FAST_BILINEAR_X86
  2174. "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
  2175. "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
  2176. "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
  2177. FAST_BILINEAR_X86
  2178. "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
  2179. "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
  2180. "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
  2181. "add $1, %%"REG_a" \n\t"
  2182. "cmp %2, %%"REG_a" \n\t"
  2183. " jb 1b \n\t"
  2184. /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
  2185. which is needed to support GCC 4.0. */
  2186. #if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
  2187. :: "m" (src1), "m" (dst), "g" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
  2188. #else
  2189. :: "m" (src1), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
  2190. #endif
  2191. "r" (src2)
  2192. : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
  2193. );
  2194. #if COMPILE_TEMPLATE_MMX2
  2195. } //if MMX2 can't be used
  2196. #endif
  2197. }
  2198. inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
  2199. int srcW, int xInc, const int16_t *hChrFilter,
  2200. const int16_t *hChrFilterPos, int hChrFilterSize,
  2201. uint8_t *formatConvBuffer,
  2202. uint32_t *pal)
  2203. {
  2204. src1 += c->chrSrcOffset;
  2205. src2 += c->chrSrcOffset;
  2206. if (c->chrToYV12) {
  2207. c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
  2208. src1= formatConvBuffer;
  2209. src2= formatConvBuffer+VOFW;
  2210. }
  2211. if (!c->hcscale_fast) {
  2212. c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  2213. c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  2214. } else { // fast bilinear upscale / crap downscale
  2215. c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
  2216. }
  2217. if (c->chrConvertRange)
  2218. c->chrConvertRange(dst, dstWidth);
  2219. }
  2220. #define DEBUG_SWSCALE_BUFFERS 0
  2221. #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
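/* Main per-slice entry point. Source lines are horizontally scaled into the
* lumPixBuf/chrPixBuf (and optionally alpPixBuf) ring buffers; then
* vLumFilterSize/vChrFilterSize of those buffered lines are vertically
* filtered into each output row. dstY, lumBufIndex/chrBufIndex and
* lastInLumBuf/lastInChrBuf carry the state across successive slice calls. */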
  2222. static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
  2223. int srcSliceH, uint8_t* dst[], int dstStride[])
  2224. {
2225. /* load a few things into local vars to make the code more readable and faster */
  2226. const int srcW= c->srcW;
  2227. const int dstW= c->dstW;
  2228. const int dstH= c->dstH;
  2229. const int chrDstW= c->chrDstW;
  2230. const int chrSrcW= c->chrSrcW;
  2231. const int lumXInc= c->lumXInc;
  2232. const int chrXInc= c->chrXInc;
  2233. const enum PixelFormat dstFormat= c->dstFormat;
  2234. const int flags= c->flags;
  2235. int16_t *vLumFilterPos= c->vLumFilterPos;
  2236. int16_t *vChrFilterPos= c->vChrFilterPos;
  2237. int16_t *hLumFilterPos= c->hLumFilterPos;
  2238. int16_t *hChrFilterPos= c->hChrFilterPos;
  2239. int16_t *vLumFilter= c->vLumFilter;
  2240. int16_t *vChrFilter= c->vChrFilter;
  2241. int16_t *hLumFilter= c->hLumFilter;
  2242. int16_t *hChrFilter= c->hChrFilter;
  2243. int32_t *lumMmxFilter= c->lumMmxFilter;
  2244. int32_t *chrMmxFilter= c->chrMmxFilter;
  2245. int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
  2246. const int vLumFilterSize= c->vLumFilterSize;
  2247. const int vChrFilterSize= c->vChrFilterSize;
  2248. const int hLumFilterSize= c->hLumFilterSize;
  2249. const int hChrFilterSize= c->hChrFilterSize;
  2250. int16_t **lumPixBuf= c->lumPixBuf;
  2251. int16_t **chrPixBuf= c->chrPixBuf;
  2252. int16_t **alpPixBuf= c->alpPixBuf;
  2253. const int vLumBufSize= c->vLumBufSize;
  2254. const int vChrBufSize= c->vChrBufSize;
  2255. uint8_t *formatConvBuffer= c->formatConvBuffer;
  2256. const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
  2257. const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
  2258. int lastDstY;
  2259. uint32_t *pal=c->pal_yuv;
  2260. /* vars which will change and which we need to store back in the context */
  2261. int dstY= c->dstY;
  2262. int lumBufIndex= c->lumBufIndex;
  2263. int chrBufIndex= c->chrBufIndex;
  2264. int lastInLumBuf= c->lastInLumBuf;
  2265. int lastInChrBuf= c->lastInChrBuf;
  2266. if (isPacked(c->srcFormat)) {
  2267. src[0]=
  2268. src[1]=
  2269. src[2]=
  2270. src[3]= src[0];
  2271. srcStride[0]=
  2272. srcStride[1]=
  2273. srcStride[2]=
  2274. srcStride[3]= srcStride[0];
  2275. }
  2276. srcStride[1]<<= c->vChrDrop;
  2277. srcStride[2]<<= c->vChrDrop;
  2278. DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
  2279. src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
  2280. dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
  2281. DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
  2282. srcSliceY, srcSliceH, dstY, dstH);
  2283. DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
  2284. vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
  2285. if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
  2286. static int warnedAlready=0; //FIXME move this into the context perhaps
  2287. if (flags & SWS_PRINT_INFO && !warnedAlready) {
  2288. av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
  2289. " ->cannot do aligned memory accesses anymore\n");
  2290. warnedAlready=1;
  2291. }
  2292. }
2293. /* Note: the user might start scaling in the middle of the picture, so this
2294. will not get executed. This is not really intended, but it works
2295. currently, so people might do it. */
  2296. if (srcSliceY ==0) {
  2297. lumBufIndex=-1;
  2298. chrBufIndex=-1;
  2299. dstY=0;
  2300. lastInLumBuf= -1;
  2301. lastInChrBuf= -1;
  2302. }
  2303. lastDstY= dstY;
  2304. for (;dstY < dstH; dstY++) {
  2305. unsigned char *dest =dst[0]+dstStride[0]*dstY;
  2306. const int chrDstY= dstY>>c->chrDstVSubSample;
  2307. unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
  2308. unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
  2309. unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
  2310. const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
  2311. const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
  2312. const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
  2313. int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
  2314. int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
  2315. int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
  2316. int enough_lines;
  2317. //handle holes (FAST_BILINEAR & weird filters)
  2318. if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
  2319. if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
  2320. assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
  2321. assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
  2322. DEBUG_BUFFERS("dstY: %d\n", dstY);
  2323. DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
  2324. firstLumSrcY, lastLumSrcY, lastInLumBuf);
  2325. DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
  2326. firstChrSrcY, lastChrSrcY, lastInChrBuf);
  2327. // Do we have enough lines in this slice to output the dstY line
  2328. enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
  2329. if (!enough_lines) {
  2330. lastLumSrcY = srcSliceY + srcSliceH - 1;
  2331. lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
  2332. DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
  2333. lastLumSrcY, lastChrSrcY);
  2334. }
  2335. //Do horizontal scaling
  2336. while(lastInLumBuf < lastLumSrcY) {
  2337. const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
  2338. const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
  2339. lumBufIndex++;
  2340. assert(lumBufIndex < 2*vLumBufSize);
  2341. assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
  2342. assert(lastInLumBuf + 1 - srcSliceY >= 0);
  2343. RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
  2344. hLumFilter, hLumFilterPos, hLumFilterSize,
  2345. formatConvBuffer,
  2346. pal, 0);
  2347. if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
  2348. RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
  2349. hLumFilter, hLumFilterPos, hLumFilterSize,
  2350. formatConvBuffer,
  2351. pal, 1);
  2352. lastInLumBuf++;
  2353. DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
  2354. lumBufIndex, lastInLumBuf);
  2355. }
  2356. while(lastInChrBuf < lastChrSrcY) {
  2357. const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
  2358. const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
  2359. chrBufIndex++;
  2360. assert(chrBufIndex < 2*vChrBufSize);
  2361. assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
  2362. assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
  2363. //FIXME replace parameters through context struct (some at least)
  2364. if (c->needs_hcscale)
  2365. RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
  2366. hChrFilter, hChrFilterPos, hChrFilterSize,
  2367. formatConvBuffer,
  2368. pal);
  2369. lastInChrBuf++;
  2370. DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
  2371. chrBufIndex, lastInChrBuf);
  2372. }
  2373. //wrap buf index around to stay inside the ring buffer
  2374. if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
  2375. if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
  2376. if (!enough_lines)
  2377. break; //we can't output a dstY line so let's try with the next slice
  2378. c->blueDither= ff_dither8[dstY&1];
  2379. if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
  2380. c->greenDither= ff_dither8[dstY&1];
  2381. else
  2382. c->greenDither= ff_dither4[dstY&1];
  2383. c->redDither= ff_dither8[(dstY+1)&1];
  2384. if (dstY < dstH-2) {
  2385. const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  2386. const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
  2387. const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
  2388. int i;
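/* Pack (source line pointer, coefficient) pairs into lumMmxFilter/chrMmxFilter
* for the inline-asm vertical scalers: the SWS_ACCURATE_RND layout interleaves
* two source lines per entry (APCK_* offsets), while the default layout stores
* the pointer split into two 32-bit halves plus the coefficient replicated. */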
  2389. if (flags & SWS_ACCURATE_RND) {
  2390. int s= APCK_SIZE / 8;
  2391. for (i=0; i<vLumFilterSize; i+=2) {
  2392. *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
  2393. *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
  2394. lumMmxFilter[s*i+APCK_COEF/4 ]=
  2395. lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
  2396. + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
  2397. if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
  2398. *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
  2399. *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
  2400. alpMmxFilter[s*i+APCK_COEF/4 ]=
  2401. alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
  2402. }
  2403. }
  2404. for (i=0; i<vChrFilterSize; i+=2) {
  2405. *(const void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
  2406. *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
  2407. chrMmxFilter[s*i+APCK_COEF/4 ]=
  2408. chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
  2409. + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
  2410. }
  2411. } else {
  2412. for (i=0; i<vLumFilterSize; i++) {
  2413. lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
  2414. lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
  2415. lumMmxFilter[4*i+2]=
  2416. lumMmxFilter[4*i+3]=
  2417. ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
  2418. if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
  2419. alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
  2420. alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
  2421. alpMmxFilter[4*i+2]=
  2422. alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
  2423. }
  2424. }
  2425. for (i=0; i<vChrFilterSize; i++) {
  2426. chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
  2427. chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
  2428. chrMmxFilter[4*i+2]=
  2429. chrMmxFilter[4*i+3]=
  2430. ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
  2431. }
  2432. }
  2433. if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
  2434. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
  2435. if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
  2436. c->yuv2nv12X(c,
  2437. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2438. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2439. dest, uDest, dstW, chrDstW, dstFormat);
  2440. } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
  2441. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
  2442. if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
  2443. if (is16BPS(dstFormat) || is9_OR_10BPS(dstFormat)) {
  2444. yuv2yuvX16inC(
  2445. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2446. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2447. alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
  2448. dstFormat);
  2449. } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
  2450. const int16_t *lumBuf = lumSrcPtr[0];
  2451. const int16_t *chrBuf= chrSrcPtr[0];
  2452. const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
  2453. c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
  2454. } else { //General YV12
  2455. c->yuv2yuvX(c,
  2456. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2457. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2458. alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
  2459. }
  2460. } else {
  2461. assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
  2462. assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
  2463. if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
  2464. int chrAlpha= vChrFilter[2*dstY+1];
  2465. if(flags & SWS_FULL_CHR_H_INT) {
  2466. yuv2rgbXinC_full(c, //FIXME write a packed1_full function
  2467. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2468. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2469. alpSrcPtr, dest, dstW, dstY);
  2470. } else {
  2471. c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
  2472. alpPixBuf ? *alpSrcPtr : NULL,
  2473. dest, dstW, chrAlpha, dstFormat, flags, dstY);
  2474. }
  2475. } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
  2476. int lumAlpha= vLumFilter[2*dstY+1];
  2477. int chrAlpha= vChrFilter[2*dstY+1];
  2478. lumMmxFilter[2]=
  2479. lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
  2480. chrMmxFilter[2]=
  2481. chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
  2482. if(flags & SWS_FULL_CHR_H_INT) {
  2483. yuv2rgbXinC_full(c, //FIXME write a packed2_full function
  2484. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2485. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2486. alpSrcPtr, dest, dstW, dstY);
  2487. } else {
  2488. c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
  2489. alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
  2490. dest, dstW, lumAlpha, chrAlpha, dstY);
  2491. }
  2492. } else { //general RGB
  2493. if(flags & SWS_FULL_CHR_H_INT) {
  2494. yuv2rgbXinC_full(c,
  2495. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2496. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2497. alpSrcPtr, dest, dstW, dstY);
  2498. } else {
  2499. c->yuv2packedX(c,
  2500. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2501. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2502. alpSrcPtr, dest, dstW, dstY);
  2503. }
  2504. }
  2505. }
  2506. } else { // hmm looks like we can't use MMX here without overwriting this array's tail
  2507. const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  2508. const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
  2509. const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
  2510. if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
  2511. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
  2512. if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
  2513. yuv2nv12XinC(
  2514. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2515. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2516. dest, uDest, dstW, chrDstW, dstFormat);
  2517. } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
  2518. const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
  2519. if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
  2520. if (is16BPS(dstFormat) || is9_OR_10BPS(dstFormat)) {
  2521. yuv2yuvX16inC(
  2522. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2523. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2524. alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
  2525. dstFormat);
  2526. } else {
  2527. yuv2yuvXinC(
  2528. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  2529. vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2530. alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
  2531. }
  2532. } else {
  2533. assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
  2534. assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
  2535. if(flags & SWS_FULL_CHR_H_INT) {
  2536. yuv2rgbXinC_full(c,
  2537. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2538. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2539. alpSrcPtr, dest, dstW, dstY);
  2540. } else {
  2541. yuv2packedXinC(c,
  2542. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  2543. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  2544. alpSrcPtr, dest, dstW, dstY);
  2545. }
  2546. }
  2547. }
  2548. }
  2549. if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
  2550. fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
  2551. if (COMPILE_TEMPLATE_MMX2) __asm__ volatile("sfence":::"memory");
  2552. /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
  2553. if (COMPILE_TEMPLATE_AMD3DNOW) __asm__ volatile("femms" :::"memory");
  2554. else __asm__ volatile("emms" :::"memory");
  2555. /* store changed local vars back in the context */
  2556. c->dstY= dstY;
  2557. c->lumBufIndex= lumBufIndex;
  2558. c->chrBufIndex= chrBufIndex;
  2559. c->lastInLumBuf= lastInLumBuf;
  2560. c->lastInChrBuf= lastInChrBuf;
  2561. return dstY - lastDstY;
  2562. }
  2563. static void RENAME(sws_init_swScale)(SwsContext *c)
  2564. {
  2565. enum PixelFormat srcFormat = c->srcFormat;
  2566. c->yuv2nv12X = RENAME(yuv2nv12X );
  2567. c->yuv2yuv1 = RENAME(yuv2yuv1 );
  2568. c->yuv2yuvX = RENAME(yuv2yuvX );
  2569. c->yuv2packed1 = RENAME(yuv2packed1 );
  2570. c->yuv2packed2 = RENAME(yuv2packed2 );
  2571. c->yuv2packedX = RENAME(yuv2packedX );
  2572. c->hScale = RENAME(hScale );
  2573. // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
  2574. if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
  2575. {
  2576. c->hyscale_fast = RENAME(hyscale_fast);
  2577. c->hcscale_fast = RENAME(hcscale_fast);
  2578. } else {
  2579. c->hyscale_fast = NULL;
  2580. c->hcscale_fast = NULL;
  2581. }
  2582. switch(srcFormat) {
  2583. case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
  2584. case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
  2585. case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
  2586. case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
  2587. case PIX_FMT_YUV420P16BE:
  2588. case PIX_FMT_YUV422P16BE:
  2589. case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
  2590. case PIX_FMT_YUV420P16LE:
  2591. case PIX_FMT_YUV422P16LE:
  2592. case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
  2593. default: break;
  2594. }
  2595. if (c->chrSrcHSubSample) {
  2596. switch(srcFormat) {
  2597. case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
  2598. case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
  2599. default: break;
  2600. }
  2601. } else {
  2602. switch(srcFormat) {
  2603. case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
  2604. case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
  2605. default: break;
  2606. }
  2607. }
  2608. switch (srcFormat) {
  2609. case PIX_FMT_YUYV422 :
  2610. case PIX_FMT_YUV420P16BE:
  2611. case PIX_FMT_YUV422P16BE:
  2612. case PIX_FMT_YUV444P16BE:
  2613. case PIX_FMT_Y400A :
  2614. case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
  2615. case PIX_FMT_UYVY422 :
  2616. case PIX_FMT_YUV420P16LE:
  2617. case PIX_FMT_YUV422P16LE:
  2618. case PIX_FMT_YUV444P16LE:
  2619. case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
  2620. case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
  2621. case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
  2622. default: break;
  2623. }
  2624. if (c->alpPixBuf) {
  2625. switch (srcFormat) {
  2626. case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
  2627. default: break;
  2628. }
  2629. }
  2630. if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
  2631. if (c->srcRange) {
  2632. c->lumConvertRange = RENAME(lumRangeFromJpeg);
  2633. c->chrConvertRange = RENAME(chrRangeFromJpeg);
  2634. } else {
  2635. c->lumConvertRange = RENAME(lumRangeToJpeg);
  2636. c->chrConvertRange = RENAME(chrRangeToJpeg);
  2637. }
  2638. }
  2639. }