/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "swscale_template.h"

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PREFETCH

#if COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
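
/*
 * MOVNTQ resolves to a non-temporal store on MMX2 and to a plain movq
 * otherwise; movntq bypasses the cache, which pays off because the scaler
 * writes each destination line once and does not read it back.
 * Illustrative expansion (not part of the original file):
 *
 *   MOVNTQ(%%mm3, (%1, %%REGa))
 *     MMX2:      "movntq %%mm3, (%1, %%REGa) \n\t"
 *     otherwise: "movq %%mm3, (%1, %%REGa) \n\t"
 */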

#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ".p2align 4 \n\t" /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" ((x86_reg)width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
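
/*
 * A scalar sketch of what YSCALEYUV2YV12X computes per output pixel,
 * mirroring swscale's C reference path (illustrative only, kept out of
 * the build): pmulhw yields (src*coeff)>>16, the VROUNDER constant
 * supplies rounding, and psraw $3 plus packuswb give a clipped total.
 */
#if 0
static void yuv2yv12X_scalar_sketch(const int16_t **src, const int16_t *filter,
                                    int filterSize, uint8_t *dest, int dstW)
{
    for (int i = 0; i < dstW; i++) {
        int val = 1 << 18;                    /* rounding term, cf. VROUNDER */
        for (int j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];     /* 16-bit samples x coefficients */
        val >>= 19;                           /* pmulhw >>16 plus psraw $3 */
        dest[i] = val < 0 ? 0 : val > 255 ? 255 : val; /* packuswb clip */
    }
}
#endif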

#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" ((x86_reg)width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ".p2align 4 \n\t" /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t"\
    ".p2align 4 \n\t" /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW_reg),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/

#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ".p2align 4 \n\t"\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW_reg) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );

#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ".p2align 4 \n\t"\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\


#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)

#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"

#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"

#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)

#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
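
/*
 * Scalar sketch of the interleave the WRITEBGR32 punpck ladder performs
 * (illustrative only): four planes of packed bytes become B,G,R,A quads
 * in memory.
 */
#if 0
static void write_bgr32_sketch(uint8_t *dst, int n, const uint8_t *b,
                               const uint8_t *g, const uint8_t *r,
                               const uint8_t *a)
{
    for (int i = 0; i < n; i++) {
        dst[4*i + 0] = b[i];
        dst[4*i + 1] = g[i];
        dst[4*i + 2] = r[i];
        dst[4*i + 3] = a[i];
    }
}
#endif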

#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
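
/*
 * Scalar sketch of the RGB565 packing above (illustrative only): the
 * bF8/bFC masks drop the bits that do not survive, and the shifts place
 * 5 bits of red, 6 of green and 5 of blue in one 16-bit pixel.
 */
#if 0
static uint16_t rgb565_sketch(uint8_t r, uint8_t g, uint8_t b)
{
    return ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3);
}
#endif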

#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)

#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif

#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
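
/*
 * Scalar sketch of the YUYV interleave WRITEYUY2 performs (illustrative
 * only): two luma samples share one U/V pair, giving Y0 U Y1 V per four
 * output bytes.
 */
#if 0
static void write_yuy2_sketch(uint8_t *dst, int n, const uint8_t *y,
                              const uint8_t *u, const uint8_t *v)
{
    for (int i = 0; i < n / 2; i++) {
        dst[4*i + 0] = y[2*i];
        dst[4*i + 1] = u[i];
        dst[4*i + 2] = y[2*i + 1];
        dst[4*i + 3] = v[i];
    }
}
#endif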

static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    if (c->flags & SWS_ACCURATE_RND) {
        if (uDest) {
            YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }
        if (CONFIG_SWSCALE_ALPHA && aDest) {
            YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
        }
        YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    } else {
        if (uDest) {
            YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }
        if (CONFIG_SWSCALE_ALPHA && aDest) {
            YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
        }
        YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    }
}

static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    long p= 4;
    const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
    uint8_t *dst[4]= {aDest, dest, uDest, vDest};
    x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

    if (c->flags & SWS_ACCURATE_RND) {
        while(p--) {
            if (dst[p]) {
                __asm__ volatile(
                    YSCALEYUV2YV121_ACCURATE
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                       "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }
    } else {
        while(p--) {
            if (dst[p]) {
                __asm__ volatile(
                    YSCALEYUV2YV121
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                       "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }
    }
}

/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;

    if (c->flags & SWS_ACCURATE_RND) {
        switch(c->dstFormat) {
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "movq %%mm2, "U_TEMP"(%0) \n\t"
                "movq %%mm4, "V_TEMP"(%0) \n\t"
                "movq %%mm5, "Y_TEMP"(%0) \n\t"
                YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
                "movq "Y_TEMP"(%0), %%mm5 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                "packuswb %%mm7, %%mm1 \n\t"
                WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
                YSCALEYUV2PACKEDX_END
            } else {
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                YSCALEYUV2PACKEDX_END
            }
            return;
        case PIX_FMT_BGR24:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            "pxor %%mm7, %%mm7 \n\t"
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
            "add %4, %%"REG_c" \n\t"
            WRITEBGR24(%%REGc, %5, %%REGa)
            :: "r" (&c->redDither),
               "m" (dummy), "m" (dummy), "m" (dummy),
               "r" (dest), "m" (dstW_reg)
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
            );
            return;
        case PIX_FMT_RGB555:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
            WRITERGB15(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_RGB565:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
            WRITERGB16(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_YUYV422:
            YSCALEYUV2PACKEDX_ACCURATE
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
            "psraw $3, %%mm3 \n\t"
            "psraw $3, %%mm4 \n\t"
            "psraw $3, %%mm1 \n\t"
            "psraw $3, %%mm7 \n\t"
            WRITEYUY2(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        }
    } else {
        switch(c->dstFormat) {
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                "packuswb %%mm7, %%mm1 \n\t"
                WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                YSCALEYUV2PACKEDX_END
            } else {
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                YSCALEYUV2PACKEDX_END
            }
            return;
        case PIX_FMT_BGR24:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            "pxor %%mm7, %%mm7 \n\t"
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
            "add %4, %%"REG_c" \n\t"
            WRITEBGR24(%%REGc, %5, %%REGa)
            :: "r" (&c->redDither),
               "m" (dummy), "m" (dummy), "m" (dummy),
               "r" (dest), "m" (dstW_reg)
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
            );
            return;
        case PIX_FMT_RGB555:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
            WRITERGB15(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_RGB565:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
            WRITERGB16(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_YUYV422:
            YSCALEYUV2PACKEDX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
            "psraw $3, %%mm3 \n\t"
            "psraw $3, %%mm4 \n\t"
            "psraw $3, %%mm1 \n\t"
            "psraw $3, %%mm7 \n\t"
            WRITEYUY2(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        }
    }
    yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                   chrFilter, chrSrc, chrFilterSize,
                   alpSrc, dest, dstW, dstY);
}

/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    switch(c->dstFormat) {
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    case PIX_FMT_RGB32:
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
            __asm__ volatile(
                YSCALEYUV2RGB(%%r8, %5)
                YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
                "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                "packuswb %%mm7, %%mm1 \n\t"
                WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
                   "a" (&c->redDither)
                   ,"r" (abuf0), "r" (abuf1)
                : "%r8"
            );
#else
            c->u_temp=(intptr_t)abuf0;
            c->v_temp=(intptr_t)abuf1;
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "push %0 \n\t"
                "push %1 \n\t"
                "mov "U_TEMP"(%5), %0 \n\t"
                "mov "V_TEMP"(%5), %1 \n\t"
                YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
                "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                "packuswb %%mm7, %%mm1 \n\t"
                "pop %1 \n\t"
                "pop %0 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
#endif
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
        return;
    case PIX_FMT_BGR24:
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
        return;
    case PIX_FMT_RGB555:
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
        return;
    case PIX_FMT_RGB565:
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
        return;
    case PIX_FMT_YUYV422:
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
        return;
    }
    yuv2packed2_c(c, buf0, buf1, uvbuf0, uvbuf1, abuf0, abuf1,
                  dest, dstW, yalpha, uvalpha, y);
}
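
/*
 * Scalar sketch of the vertical bilinear blend the YSCALEYUV2* macros
 * used above perform between two source lines (illustrative only; alpha
 * is the pre-scaled fixed-point weight kept in the MMX filter slot):
 */
#if 0
static int vblend_sketch(int v0, int v1, int alpha)
{
    /* pmulhw gives ((v0 - v1) * alpha) >> 16; psraw $7 (or $4 on the
     * RGB paths) rescales the second line before the add. */
    return (v1 >> 7) + (((v0 - v1) * alpha) >> 16);
}
#endif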

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (flags&SWS_FULL_CHR_H_INT) {
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        switch(dstFormat) {
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    YSCALEYUV2RGB1_ALPHA(%%REGBP)
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
            } else {
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
            }
            return;
        case PIX_FMT_BGR24:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB555:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB565:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED1(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        }
    } else {
        switch(dstFormat) {
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    YSCALEYUV2RGB1_ALPHA(%%REGBP)
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
            } else {
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
            }
            return;
        case PIX_FMT_BGR24:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB555:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB565:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED1b(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        }
    }
    yuv2packed1_c(c, buf0, uvbuf0, uvbuf1, abuf0, dest,
                  dstW, uvalpha, dstFormat, flags, y);
}

//FIXME yuy2* can read up to 7 samples too much

static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}
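
/*
 * Scalar sketch of yuy2ToY (illustrative only): the bm01010101 mask keeps
 * the even (luma) bytes of YUYV data and packuswb compacts them; the asm
 * counts a negative index up to zero so no explicit compare is needed.
 */
#if 0
static void yuy2_to_y_sketch(uint8_t *dst, const uint8_t *src, long width)
{
    for (long i = 0; i < width; i++)
        dst[i] = src[2*i];              /* Y0 U0 Y1 V0 ... -> Y0 Y1 ... */
}
#endif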

static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}
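
/*
 * Scalar sketch of yuy2ToUV (illustrative only): chroma bytes sit at
 * offsets 1 (U) and 3 (V) of every 4-byte YUYV group.
 */
#if 0
static void yuy2_to_uv_sketch(uint8_t *dstU, uint8_t *dstV,
                              const uint8_t *src, long width)
{
    for (long i = 0; i < width; i++) {
        dstU[i] = src[4*i + 1];
        dstV[i] = src[4*i + 3];
    }
}
#endif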

static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
}
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    __asm__ volatile(
        "mov                    %0, %%"REG_a"   \n\t"
        "1:                                     \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0       \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1       \n\t"
        "psrlw                  $8, %%mm0       \n\t"
        "psrlw                  $8, %%mm1       \n\t"
        "packuswb            %%mm1, %%mm0       \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"   \n\t"
        " js                    1b              \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}

static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4       \n\t"
        "mov                    %0, %%"REG_a"   \n\t"
        "1:                                     \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0       \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1       \n\t"
        "pand                %%mm4, %%mm0       \n\t"
        "pand                %%mm4, %%mm1       \n\t"
        "packuswb            %%mm1, %%mm0       \n\t"
        "movq                %%mm0, %%mm1       \n\t"
        "psrlw                  $8, %%mm0       \n\t"
        "pand                %%mm4, %%mm1       \n\t"
        "packuswb            %%mm0, %%mm0       \n\t"
        "packuswb            %%mm1, %%mm1       \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
        "add                    $4, %%"REG_a"   \n\t"
        " js                    1b              \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}

static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4       \n\t"
        "mov                    %0, %%"REG_a"   \n\t"
        "1:                                     \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0       \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1       \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2       \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3       \n\t"
        "pand                %%mm4, %%mm0       \n\t"
        "pand                %%mm4, %%mm1       \n\t"
        "pand                %%mm4, %%mm2       \n\t"
        "pand                %%mm4, %%mm3       \n\t"
        "packuswb            %%mm1, %%mm0       \n\t"
        "packuswb            %%mm3, %%mm2       \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"   \n\t"
        " js                    1b              \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
}
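/* LEToUV (further up) and BEToUV reduce 16-bit-per-sample chroma planes to
 * 8 bits by keeping the most significant byte of each sample: psrlw $8 for
 * little-endian input, the bm01010101 mask for big-endian, since the
 * significant byte sits at opposite positions within each 16-bit word. */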
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, long width)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4       \n\t"
        "mov                    %0, %%"REG_a"   \n\t"
        "1:                                     \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0       \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1       \n\t"
        "movq                %%mm0, %%mm2       \n\t"
        "movq                %%mm1, %%mm3       \n\t"
        "pand                %%mm4, %%mm0       \n\t"
        "pand                %%mm4, %%mm1       \n\t"
        "psrlw                  $8, %%mm2       \n\t"
        "psrlw                  $8, %%mm3       \n\t"
        "packuswb            %%mm1, %%mm0       \n\t"
        "packuswb            %%mm3, %%mm2       \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "movq                %%mm2, (%3, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"   \n\t"
        " js                    1b              \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
}
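/* nvXXtoUV deinterleaves one packed chroma line: the masked even bytes go
 * to dst1 and the shifted odd bytes to dst2. The two wrappers below only
 * pick the destination order, which is all that distinguishes NV12 from
 * NV21. */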
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}

static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
static inline void RENAME(bgr24ToY_mmx)(int16_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    if(srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5    \n\t"
            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6    \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5    \n\t"
            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6    \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4    \n\t"
        "mov                    %2, %%"REG_a"   \n\t"
        "pxor                %%mm7, %%mm7       \n\t"
        "1:                                     \n\t"
        PREFETCH"            64(%0)             \n\t"
        "movd                 (%0), %%mm0       \n\t"
        "movd                2(%0), %%mm1       \n\t"
        "movd                6(%0), %%mm2       \n\t"
        "movd                8(%0), %%mm3       \n\t"
        "add                   $12, %0          \n\t"
        "punpcklbw           %%mm7, %%mm0       \n\t"
        "punpcklbw           %%mm7, %%mm1       \n\t"
        "punpcklbw           %%mm7, %%mm2       \n\t"
        "punpcklbw           %%mm7, %%mm3       \n\t"
        "pmaddwd             %%mm5, %%mm0       \n\t"
        "pmaddwd             %%mm6, %%mm1       \n\t"
        "pmaddwd             %%mm5, %%mm2       \n\t"
        "pmaddwd             %%mm6, %%mm3       \n\t"
        "paddd               %%mm1, %%mm0       \n\t"
        "paddd               %%mm3, %%mm2       \n\t"
        "paddd               %%mm4, %%mm0       \n\t"
        "paddd               %%mm4, %%mm2       \n\t"
        "psrad                  $9, %%mm0       \n\t"
        "psrad                  $9, %%mm2       \n\t"
        "packssdw            %%mm2, %%mm0       \n\t"
        "movq                %%mm0, (%1, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"   \n\t"
        " js                    1b              \n\t"
        : "+r" (src)
        : "r" (dst+width), "g" ((x86_reg)-2*width)
        : "%"REG_a
    );
}
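/* The luma conversion above is a pmaddwd dot product: each pixel's bytes
 * are zero-extended to words and multiplied against the coefficient pairs
 * in ff_bgr24toY1Coeff/ff_bgr24toY2Coeff. Schematically, per pixel (the
 * coefficient names ry/gy/by here are illustrative, the actual values live
 * in those tables):
 *
 *     Y = (ry*R + gy*G + by*B + ff_bgr24toYOffset) >> 9;
 *
 * Loading at offsets 0/2/6/8 of each 12-byte group lines four RGB24 pixels
 * up with the two coefficient registers. */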
static inline void RENAME(bgr24ToUV_mmx)(int16_t *dstU, int16_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq                24(%4), %%mm6      \n\t"
        "mov                     %3, %%"REG_a"  \n\t"
        "pxor                 %%mm7, %%mm7      \n\t"
        "1:                                     \n\t"
        PREFETCH"             64(%0)            \n\t"
        "movd                  (%0), %%mm0      \n\t"
        "movd                 2(%0), %%mm1      \n\t"
        "punpcklbw            %%mm7, %%mm0      \n\t"
        "punpcklbw            %%mm7, %%mm1      \n\t"
        "movq                 %%mm0, %%mm2      \n\t"
        "movq                 %%mm1, %%mm3      \n\t"
        "pmaddwd               (%4), %%mm0      \n\t"
        "pmaddwd              8(%4), %%mm1      \n\t"
        "pmaddwd             16(%4), %%mm2      \n\t"
        "pmaddwd              %%mm6, %%mm3      \n\t"
        "paddd                %%mm1, %%mm0      \n\t"
        "paddd                %%mm3, %%mm2      \n\t"
        "movd                 6(%0), %%mm1      \n\t"
        "movd                 8(%0), %%mm3      \n\t"
        "add                    $12, %0         \n\t"
        "punpcklbw            %%mm7, %%mm1      \n\t"
        "punpcklbw            %%mm7, %%mm3      \n\t"
        "movq                 %%mm1, %%mm4      \n\t"
        "movq                 %%mm3, %%mm5      \n\t"
        "pmaddwd               (%4), %%mm1      \n\t"
        "pmaddwd              8(%4), %%mm3      \n\t"
        "pmaddwd             16(%4), %%mm4      \n\t"
        "pmaddwd              %%mm6, %%mm5      \n\t"
        "paddd                %%mm3, %%mm1      \n\t"
        "paddd                %%mm5, %%mm4      \n\t"
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
        "paddd                %%mm3, %%mm0      \n\t"
        "paddd                %%mm3, %%mm2      \n\t"
        "paddd                %%mm3, %%mm1      \n\t"
        "paddd                %%mm3, %%mm4      \n\t"
        "psrad                   $9, %%mm0      \n\t"
        "psrad                   $9, %%mm2      \n\t"
        "psrad                   $9, %%mm1      \n\t"
        "psrad                   $9, %%mm4      \n\t"
        "packssdw             %%mm1, %%mm0      \n\t"
        "packssdw             %%mm4, %%mm2      \n\t"
        "movq                 %%mm0, (%1, %%"REG_a") \n\t"
        "movq                 %%mm2, (%2, %%"REG_a") \n\t"
        "add                     $8, %%"REG_a"  \n\t"
        " js                     1b             \n\t"
        : "+r" (src)
        : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-2*width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
        : "%"REG_a
    );
}
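/* Same dot-product structure as bgr24ToY_mmx, but U and V are computed in
 * parallel: %4 points into ff_bgr24toUV[], indexed by whether the input is
 * RGB24 or BGR24, so one asm body serves both byte orders. */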
static inline void RENAME(bgr24ToY)(int16_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
}

static inline void RENAME(bgr24ToUV)(int16_t *dstU, int16_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
    assert(src1 == src2);
}

static inline void RENAME(rgb24ToY)(int16_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
}

static inline void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    assert(src1 == src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
}
// bilinear / bicubic scaling
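/* hScale convolves each output sample with its precomputed FIR filter.
 * Every branch below evaluates the same thing as this scalar sketch (the
 * in-file scalar fallback of hScale16 follows the identical pattern, with
 * 'shift' in place of the constant 7):
 *
 *     for (i = 0; i < dstW; i++) {
 *         int j, val = 0;
 *         for (j = 0; j < filterSize; j++)
 *             val += src[filterPos[i] + j] * filter[filterSize*i + j];
 *         dst[i] = val >> 7;   // matches the 'psrad $7' in the asm
 *     }
 *
 * filterSize 4 and 8 get fully unrolled inner loops; anything larger goes
 * through the generic two-level loop at the end. */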
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
{
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for downscaling, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b"                 \n\t"
#endif
            "pxor %%mm7, %%mm7              \n\t"
            "push %%"REG_BP"                \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP"      \n\t"
            ".p2align 4                     \n\t"
            "1:                             \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0    \n\t"
            "movd (%3, %%"REG_b"), %%mm2    \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "movq %%mm0, %%mm4              \n\t"
            "punpckldq %%mm3, %%mm0         \n\t"
            "punpckhdq %%mm3, %%mm4         \n\t"
            "paddd %%mm4, %%mm0             \n\t"
            "psrad $7, %%mm0                \n\t"
            "packssdw %%mm0, %%mm0          \n\t"
            "movd %%mm0, (%4, %%"REG_BP")   \n\t"
            "add $4, %%"REG_BP"             \n\t"
            " jnc 1b                        \n\t"
            "pop %%"REG_BP"                 \n\t"
#if defined(PIC)
            "pop %%"REG_b"                  \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b"                 \n\t"
#endif
            "pxor %%mm7, %%mm7              \n\t"
            "push %%"REG_BP"                \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP"      \n\t"
            ".p2align 4                     \n\t"
            "1:                             \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0    \n\t"
            "movd (%3, %%"REG_b"), %%mm2    \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movd 4(%3, %%"REG_a"), %%mm4   \n\t"
            "movd 4(%3, %%"REG_b"), %%mm2   \n\t"
            "punpcklbw %%mm7, %%mm4         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm4           \n\t"
            "pmaddwd %%mm2, %%mm5           \n\t"
            "paddd %%mm4, %%mm0             \n\t"
            "paddd %%mm5, %%mm3             \n\t"
            "movq %%mm0, %%mm4              \n\t"
            "punpckldq %%mm3, %%mm0         \n\t"
            "punpckhdq %%mm3, %%mm4         \n\t"
            "paddd %%mm4, %%mm0             \n\t"
            "psrad $7, %%mm0                \n\t"
            "packssdw %%mm0, %%mm0          \n\t"
            "movd %%mm0, (%4, %%"REG_BP")   \n\t"
            "add $4, %%"REG_BP"             \n\t"
            " jnc 1b                        \n\t"
            "pop %%"REG_BP"                 \n\t"
#if defined(PIC)
            "pop %%"REG_b"                  \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor %%mm7, %%mm7              \n\t"
            ".p2align 4                     \n\t"
            "1:                             \n\t"
            "mov %2, %%"REG_c"              \n\t"
            "movzwl (%%"REG_c", %0), %%eax  \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c"              \n\t"
            "pxor %%mm4, %%mm4              \n\t"
            "pxor %%mm5, %%mm5              \n\t"
            "2:                             \n\t"
            "movq (%1), %%mm1               \n\t"
            "movq (%1, %6), %%mm3           \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "paddd %%mm3, %%mm5             \n\t"
            "paddd %%mm0, %%mm4             \n\t"
            "add $8, %1                     \n\t"
            "add $4, %%"REG_c"              \n\t"
            "cmp %4, %%"REG_c"              \n\t"
            " jb 2b                         \n\t"
            "add %6, %1                     \n\t"
            "movq %%mm4, %%mm0              \n\t"
            "punpckldq %%mm5, %%mm4         \n\t"
            "punpckhdq %%mm5, %%mm0         \n\t"
            "paddd %%mm0, %%mm4             \n\t"
            "psrad $7, %%mm4                \n\t"
            "packssdw %%mm4, %%mm4          \n\t"
            "mov %3, %%"REG_a"              \n\t"
            "movd %%mm4, (%%"REG_a", %0)    \n\t"
            "add $4, %0                     \n\t"
            " jnc 1b                        \n\t"
            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
}
static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
                                    const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
{
    int i, j;

    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4 && shift<15) { // Always true for upscaling, sometimes for downscaling, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd %5, %%mm7                 \n\t"
#if defined(PIC)
            "push %%"REG_b"                 \n\t"
#endif
            "push %%"REG_BP"                \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP"      \n\t"
            ".p2align 4                     \n\t"
            "1:                             \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
            "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "movq %%mm0, %%mm4              \n\t"
            "punpckldq %%mm3, %%mm0         \n\t"
            "punpckhdq %%mm3, %%mm4         \n\t"
            "paddd %%mm4, %%mm0             \n\t"
            "psrad %%mm7, %%mm0             \n\t"
            "packssdw %%mm0, %%mm0          \n\t"
            "movd %%mm0, (%4, %%"REG_BP")   \n\t"
            "add $4, %%"REG_BP"             \n\t"
            " jnc 1b                        \n\t"
            "pop %%"REG_BP"                 \n\t"
#if defined(PIC)
            "pop %%"REG_b"                  \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8 && shift<15) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd %5, %%mm7                 \n\t"
#if defined(PIC)
            "push %%"REG_b"                 \n\t"
#endif
            "push %%"REG_BP"                \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP"      \n\t"
            ".p2align 4                     \n\t"
            "1:                             \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
            "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movq 8(%3, %%"REG_a", 2), %%mm4 \n\t"
            "movq 8(%3, %%"REG_b", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm4           \n\t"
            "pmaddwd %%mm2, %%mm5           \n\t"
            "paddd %%mm4, %%mm0             \n\t"
            "paddd %%mm5, %%mm3             \n\t"
            "movq %%mm0, %%mm4              \n\t"
            "punpckldq %%mm3, %%mm0         \n\t"
            "punpckhdq %%mm3, %%mm4         \n\t"
            "paddd %%mm4, %%mm0             \n\t"
            "psrad %%mm7, %%mm0             \n\t"
            "packssdw %%mm0, %%mm0          \n\t"
            "movd %%mm0, (%4, %%"REG_BP")   \n\t"
            "add $4, %%"REG_BP"             \n\t"
            " jnc 1b                        \n\t"
            "pop %%"REG_BP"                 \n\t"
#if defined(PIC)
            "pop %%"REG_b"                  \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (shift<15) {
        const uint16_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd %7, %%mm7                 \n\t"
            ".p2align 4                     \n\t"
            "1:                             \n\t"
            "mov %2, %%"REG_c"              \n\t"
            "movzwl (%%"REG_c", %0), %%eax  \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c"              \n\t"
            "pxor %%mm4, %%mm4              \n\t"
            "pxor %%mm5, %%mm5              \n\t"
            "2:                             \n\t"
            "movq (%1), %%mm1               \n\t"
            "movq (%1, %6), %%mm3           \n\t"
            "movq (%%"REG_c", %%"REG_a", 2), %%mm0 \n\t"
            "movq (%%"REG_c", %%"REG_d", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "paddd %%mm3, %%mm5             \n\t"
            "paddd %%mm0, %%mm4             \n\t"
            "add $8, %1                     \n\t"
            "add $8, %%"REG_c"              \n\t"
            "cmp %4, %%"REG_c"              \n\t"
            " jb 2b                         \n\t"
            "add %6, %1                     \n\t"
            "movq %%mm4, %%mm0              \n\t"
            "punpckldq %%mm5, %%mm4         \n\t"
            "punpckhdq %%mm5, %%mm0         \n\t"
            "paddd %%mm0, %%mm4             \n\t"
            "psrad %%mm7, %%mm4             \n\t"
            "packssdw %%mm4, %%mm4          \n\t"
            "mov %3, %%"REG_a"              \n\t"
            "movd %%mm4, (%%"REG_a", %0)    \n\t"
            "add $4, %0                     \n\t"
            " jnc 1b                        \n\t"
            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((x86_reg)filterSize*2), "m"(shift)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    } else
        for (i=0; i<dstW; i++) {
            int srcPos= filterPos[i];
            int val=0;
            for (j=0; j<filterSize; j++) {
                val += ((int)src[srcPos + j])*filter[filterSize*i + j];
            }
            dst[i] = FFMIN(val>>shift, (1<<15)-1); // the cubic equation does overflow ...
        }
}
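/* The MMX paths above are restricted to shift < 15, likely because pmaddwd
 * treats its operands as signed 16-bit values, so the samples must fit in
 * 15 bits. Deeper inputs take the scalar loop, which also clips the result
 * explicitly with FFMIN (the cubic filters can overshoot, as the comment
 * above notes). */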
#if COMPILE_TEMPLATE_MMX2
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src,
                                        int srcW, int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    int canMMX2BeUsed = c->canMMX2BeUsed;
    void *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %5        \n\t"
#endif
        "pxor %%mm7, %%mm7        \n\t"
        "mov %0, %%"REG_c"        \n\t"
        "mov %1, %%"REG_D"        \n\t"
        "mov %2, %%"REG_d"        \n\t"
        "mov %3, %%"REG_b"        \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c")     \n\t"
        PREFETCH" 32(%%"REG_c")   \n\t"
        PREFETCH" 64(%%"REG_c")   \n\t"

#if ARCH_X86_64
#define CALL_MMX2_FILTER_CODE \
        "movl            (%%"REG_b"), %%esi     \n\t"\
        "call                    *%4            \n\t"\
        "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
        "add               %%"REG_S", %%"REG_c" \n\t"\
        "add               %%"REG_a", %%"REG_D" \n\t"\
        "xor               %%"REG_a", %%"REG_a" \n\t"\

#else
#define CALL_MMX2_FILTER_CODE \
        "movl (%%"REG_b"), %%esi                \n\t"\
        "call          *%4                      \n\t"\
        "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
        "add    %%"REG_a", %%"REG_D"            \n\t"\
        "xor    %%"REG_a", %%"REG_a"            \n\t"\

#endif /* ARCH_X86_64 */

        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE

#if defined(PIC)
        "mov %5, %%"REG_b"        \n\t"
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
        dst[i] = src[srcW-1]*128;
}
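/* hyscale_fast does not loop over filter taps at all: each
 * CALL_MMX2_FILTER_CODE invocation jumps into horizontal-scaler code that
 * was generated when the context was initialized (c->lumMmx2FilterCode).
 * The cleanup loop then pads the right edge, where (i*xInc)>>16 would read
 * past srcW-1; the *128 (<<7) scales the 8-bit edge pixel to the same
 * fixed-point intermediate format the generated code produces. */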
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    int canMMX2BeUsed = c->canMMX2BeUsed;
    void *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %6        \n\t"
#endif
        "pxor %%mm7, %%mm7        \n\t"
        "mov %0, %%"REG_c"        \n\t"
        "mov %1, %%"REG_D"        \n\t"
        "mov %2, %%"REG_d"        \n\t"
        "mov %3, %%"REG_b"        \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c")     \n\t"
        PREFETCH" 32(%%"REG_c")   \n\t"
        PREFETCH" 64(%%"REG_c")   \n\t"

        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "mov %5, %%"REG_c"        \n\t" // src
        "mov %1, %%"REG_D"        \n\t" // buf1
        "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
        PREFETCH" (%%"REG_c")     \n\t"
        PREFETCH" 32(%%"REG_c")   \n\t"
        PREFETCH" 64(%%"REG_c")   \n\t"

        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE

#if defined(PIC)
        "mov %6, %%"REG_b"        \n\t"
#endif
        :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
        //printf("%d %d %d\n", dstWidth, i, srcW);
        dst[i] = src1[srcW-1]*128;
        dst[i+VOFW] = src2[srcW-1]*128;
    }
}
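/* hcscale_fast runs the generated scaler twice: first for the U samples
 * into dst, then, after advancing the destination by VOF bytes, for the V
 * samples at dst + VOFW int16_t elements, which is also where the
 * edge-padding loop writes them. */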
#endif /* COMPILE_TEMPLATE_MMX2 */
#if !COMPILE_TEMPLATE_MMX2
static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
                                  int lastInLumBuf, int lastInChrBuf)
{
    const int dstH= c->dstH;
    const int flags= c->flags;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int chrDstY= dstY>>c->chrDstVSubSample;
    const int firstLumSrcY= vLumFilterPos[dstY];    //First line needed as input
    const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input

    c->blueDither= ff_dither8[dstY&1];
    if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
        c->greenDither= ff_dither8[dstY&1];
    else
        c->greenDither= ff_dither4[dstY&1];
    c->redDither= ff_dither8[(dstY+1)&1];

    if (dstY < dstH - 2) {
        const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
        const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
        const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
        int i;
        if (flags & SWS_ACCURATE_RND) {
            int s= APCK_SIZE / 8;
            for (i=0; i<vLumFilterSize; i+=2) {
                *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                lumMmxFilter[s*i+APCK_COEF/4  ]=
                lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i]
                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                    *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                    *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                    alpMmxFilter[s*i+APCK_COEF/4  ]=
                    alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                }
            }
            for (i=0; i<vChrFilterSize; i+=2) {
                *(const void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                chrMmxFilter[s*i+APCK_COEF/4  ]=
                chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i]
                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
            }
        } else {
            for (i=0; i<vLumFilterSize; i++) {
                lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                lumMmxFilter[4*i+2]=
                lumMmxFilter[4*i+3]=
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                    alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                    alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                    alpMmxFilter[4*i+2]=
                    alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                }
            }
            for (i=0; i<vChrFilterSize; i++) {
                chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                chrMmxFilter[4*i+2]=
                chrMmxFilter[4*i+3]=
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
            }
        }
    }
}
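/* updateMMXDitherTables repacks the vertical filter into the layout the asm
 * loops expect: with SWS_ACCURATE_RND each step holds two source-line
 * pointers plus a replicated coefficient pair (the APCK_* offsets);
 * otherwise each step holds one pointer split into two 32-bit halves and
 * the 16-bit coefficient replicated via *0x10001. */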
#endif /* !COMPILE_TEMPLATE_MMX2 */
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;

    if (!(c->flags & SWS_BITEXACT)) {
        c->yuv2yuv1     = RENAME(yuv2yuv1    );
        c->yuv2yuvX     = RENAME(yuv2yuvX    );
        c->yuv2packed1  = RENAME(yuv2packed1 );
        c->yuv2packed2  = RENAME(yuv2packed2 );
        c->yuv2packedX  = RENAME(yuv2packedX );
    }

    c->hScale       = RENAME(hScale      );

    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMX2
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
    {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
        c->hyscale_fast = NULL;
        c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMX2
    }
#endif /* COMPILE_TEMPLATE_MMX2 */

    switch(srcFormat) {
    case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
    case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
    case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
    case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
    case PIX_FMT_GRAY16LE :
    case PIX_FMT_YUV420P9LE:
    case PIX_FMT_YUV422P10LE:
    case PIX_FMT_YUV420P10LE:
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE: c->hScale16= RENAME(hScale16); break;
    }
    if (!c->chrSrcHSubSample) {
        switch(srcFormat) {
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
        default: break;
        }
    }

    switch (srcFormat) {
    case PIX_FMT_YUYV422  :
    case PIX_FMT_Y400A    :
        c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422  :
        c->lumToYV12 = RENAME(uyvyToY); break;
    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
    default: break;
    }
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_Y400A  : c->alpToYV12 = RENAME(yuy2ToY); break;
        default: break;
        }
    }

    if(isAnyRGB(c->srcFormat))
        c->hScale16= RENAME(hScale16);
}
  2210. }