You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

2213 lines
99KB

  1. /*
  2. * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  3. *
  4. * This file is part of Libav.
  5. *
  6. * Libav is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * Libav is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with Libav; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  /*
   * Instruction selection for the current instantiation of this template.
   * When COMPILE_TEMPLATE_MMX2 is non-zero, PREFETCH expands to "prefetchnta"
   * and MOVNTQ emits "movntq"; otherwise PREFETCH expands to " # nop" and
   * MOVNTQ emits a plain "movq".
   * MOVNTQ forwards to REAL_MOVNTQ so that its arguments are macro-expanded
   * before REAL_MOVNTQ turns them into strings with #a / #b.
   * The macros are #undef'd first so the block can be processed repeatedly.
   */
  20. #undef REAL_MOVNTQ
  21. #undef MOVNTQ
  22. #undef PREFETCH
  23. #if COMPILE_TEMPLATE_MMX2
  24. #define PREFETCH "prefetchnta"
  25. #else
  26. #define PREFETCH " # nop"
  27. #endif
  28. #if COMPILE_TEMPLATE_MMX2
  29. #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
  30. #else
  31. #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
  32. #endif
  33. #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
  /*
   * YSCALEYUV2YV12X(offset, dest, end, pos)
   *
   * Expands to one complete `__asm__ volatile(...);` statement (the trailing
   * semicolon is part of the macro, so call sites add none). It requires a
   * variable named `c` in scope because it passes &c->redDither as operand %0.
   *
   * Operands: %0 = &c->redDither (base for DITHER16 and `offset`),
   *           %1 = dest, %2 = end, %3 = pos (running output index).
   *
   * For each group of 8 output bytes:
   *  - mm3/mm4 start from the two quadwords at DITHER16+0 / DITHER16+8;
   *  - the list at offset(%0) is walked in 16-byte steps: each entry holds a
   *    source pointer at +0 and a coefficient quadword at +8, and a zero
   *    pointer ends the walk;
   *  - per entry, 8 int16 samples at pointer + 2*%3 are multiplied with the
   *    coefficient (pmulhw) and added to mm3/mm4;
   *  - the sums are shifted right by 3, packed to unsigned bytes and stored
   *    with MOVNTQ at dest + %3.
   * %3 is then advanced by 8 and the whole thing repeats while %3 < %2
   * (unsigned compare, `jb`). Label 1 serves both the inner and outer loop.
   *
   * Clobbers REG_d and REG_S.
   * NOTE(review): operand %3 is declared as an input but is modified by
   * "add $8, %3"; no MMX registers or "memory" are listed as clobbered.
   */
  34. #define YSCALEYUV2YV12X(offset, dest, end, pos) \
  35. __asm__ volatile(\
  36. "movq "DITHER16"+0(%0), %%mm3 \n\t"\
  37. "movq "DITHER16"+8(%0), %%mm4 \n\t"\
  38. "lea " offset "(%0), %%"REG_d" \n\t"\
  39. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  40. ".p2align 4 \n\t" /* FIXME Unroll? */\
  41. "1: \n\t"\
  42. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  43. "movq (%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
  44. "movq 8(%%"REG_S", %3, 2), %%mm5 \n\t" /* srcData */\
  45. "add $16, %%"REG_d" \n\t"\
  46. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  47. "test %%"REG_S", %%"REG_S" \n\t"\
  48. "pmulhw %%mm0, %%mm2 \n\t"\
  49. "pmulhw %%mm0, %%mm5 \n\t"\
  50. "paddw %%mm2, %%mm3 \n\t"\
  51. "paddw %%mm5, %%mm4 \n\t"\
  52. " jnz 1b \n\t"\
  53. "psraw $3, %%mm3 \n\t"\
  54. "psraw $3, %%mm4 \n\t"\
  55. "packuswb %%mm4, %%mm3 \n\t"\
  56. MOVNTQ(%%mm3, (%1, %3))\
  57. "add $8, %3 \n\t"\
  58. "cmp %2, %3 \n\t"\
  59. "movq "DITHER16"+0(%0), %%mm3 \n\t"\
  60. "movq "DITHER16"+8(%0), %%mm4 \n\t"\
  61. "lea " offset "(%0), %%"REG_d" \n\t"\
  62. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  63. "jb 1b \n\t"\
  64. :: "r" (&c->redDither),\
  65. "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
  66. : "%"REG_d, "%"REG_S\
  67. );
  /*
   * dither_8to16(): expand 8 dither bytes to 8 words and store them in the
   * context for the 16-bit vertical scaler.
   *
   * Reads one quadword from srcDither. When `rot` is non-zero the quadword
   * is first rotated right by 24 bits (psrlq 24 combined with psllq 40), so
   * result byte i comes from source byte (i + 3) & 7. The bytes are then
   * zero-extended to words (unpack against a zeroed mm0), each word is
   * shifted right by 4, and the two resulting quadwords are written to
   * DITHER16+0 and DITHER16+8 relative to &c->redDither.
   *
   * The function is only compiled when COMPILE_TEMPLATE_MMX2 is 0; its name
   * is not wrapped in RENAME() (presumably so that it is defined only once
   * when the template is instantiated several times -- TODO confirm).
   *
   * NOTE(review): the asm writes memory through %1 and uses mm0/mm3/mm4 but
   * declares neither a "memory" clobber nor register clobbers.
   */
  68. #if !COMPILE_TEMPLATE_MMX2
  69. static av_always_inline void
  70. dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot)
  71. {
  72. if (rot) {
  /* rotated variant: rotate the 8 dither bytes by three positions first */
  73. __asm__ volatile("pxor %%mm0, %%mm0\n\t"
  74. "movq (%0), %%mm3\n\t"
  75. "movq %%mm3, %%mm4\n\t"
  76. "psrlq $24, %%mm3\n\t"
  77. "psllq $40, %%mm4\n\t"
  78. "por %%mm4, %%mm3\n\t"
  79. "movq %%mm3, %%mm4\n\t"
  80. "punpcklbw %%mm0, %%mm3\n\t"
  81. "punpckhbw %%mm0, %%mm4\n\t"
  82. "psraw $4, %%mm3\n\t"
  83. "psraw $4, %%mm4\n\t"
  84. "movq %%mm3, "DITHER16"+0(%1)\n\t"
  85. "movq %%mm4, "DITHER16"+8(%1)\n\t"
  86. :: "r"(srcDither), "r"(&c->redDither)
  87. );
  88. } else {
  /* unrotated variant: same expansion without the byte rotation */
  89. __asm__ volatile("pxor %%mm0, %%mm0\n\t"
  90. "movq (%0), %%mm3\n\t"
  91. "movq %%mm3, %%mm4\n\t"
  92. "punpcklbw %%mm0, %%mm3\n\t"
  93. "punpckhbw %%mm0, %%mm4\n\t"
  94. "psraw $4, %%mm3\n\t"
  95. "psraw $4, %%mm4\n\t"
  96. "movq %%mm3, "DITHER16"+0(%1)\n\t"
  97. "movq %%mm4, "DITHER16"+8(%1)\n\t"
  98. :: "r"(srcDither), "r"(&c->redDither)
  99. );
  100. }
  101. }
  102. #endif
  /**
   * Vertical multi-tap filter writing planar 8-bit output, using the 16-bit
   * accumulator macro YSCALEYUV2YV12X.
   *
   * The filter/source parameters (lumFilter, lumSrc, lumFilterSize,
   * chrFilter, chrUSrc, chrVSrc, chrFilterSize, alpSrc) are not referenced
   * in this body; the asm reads its filter lists from the context at
   * CHR_/ALP_/LUM_MMX_FILTER_OFFSET relative to &c->redDither.
   *
   * @param c        scaler context; supplies lumDither8, chrDither8,
   *                 uv_off_byte and the data addressed from &c->redDither
   * @param dest     output planes: [0] luma, [1] U, [2] V, [3] alpha
   *                 (alpha is only considered when CONFIG_SWSCALE_ALPHA)
   * @param dstW     end index passed to the luma and alpha passes
   * @param chrDstW  end index passed to the chroma passes
   *
   * Order matters: each dither_8to16() call stores the dither words that
   * the following YSCALEYUV2YV12X statement loads again.
   */
  103. static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
  104. const int16_t **lumSrc, int lumFilterSize,
  105. const int16_t *chrFilter, const int16_t **chrUSrc,
  106. const int16_t **chrVSrc,
  107. int chrFilterSize, const int16_t **alpSrc,
  108. uint8_t *dest[4], int dstW, int chrDstW)
  109. {
  110. uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
  111. *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
  112. const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
  /* chroma is skipped entirely when there is no U plane */
  113. if (uDest) {
  /* uv_off is in int16 samples (the asm scales the index by 2) */
  114. x86_reg uv_off = c->uv_off_byte >> 1;
  115. dither_8to16(c, chrDither, 0);
  116. YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
  /* V uses the rotated dither; the index starts at uv_off, so the
   * destination is biased by -uv_off to make the first store hit vDest */
  117. dither_8to16(c, chrDither, 1);
  118. YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
  119. }
  /* the luma dither is shared by the alpha and luma passes */
  120. dither_8to16(c, lumDither, 0);
  121. if (CONFIG_SWSCALE_ALPHA && aDest) {
  122. YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
  123. }
  124. YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, yDest, dstW, 0)
  125. }
  /*
   * YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos)
   *
   * Same operand layout and call convention as YSCALEYUV2YV12X (complete
   * asm statement, needs `c` in scope; %0 = &c->redDither, %1 = dest,
   * %2 = end, %3 = pos), but accumulates in 32 bits.
   *
   * Per group of 8 output bytes:
   *  - mm4..mm7 are loaded from the four quadwords at DITHER32+0..+24;
   *  - the list at offset(%0) is walked in APCK_SIZE steps. Each step reads
   *    a first source pointer at +0, a second one at APCK_PTR2 and a
   *    coefficient quadword at APCK_COEF; samples of the two sources are
   *    interleaved (punpcklwd/punpckhwd) and combined with pmaddwd, then
   *    added to mm4..mm7. A zero pointer at the next step ends the walk;
   *  - the sums are shifted right by 16, packed to words, the words at
   *    VROUNDER_OFFSET(%0) are added, the result is shifted right by 3,
   *    packed to unsigned bytes and stored with MOVNTQ at dest + %3.
   * %3 then advances by 8 and the loop repeats while %3 < %2 (`jb`).
   *
   * The clobber list names REG_a, REG_d and REG_S; REG_a does not appear in
   * the instruction text of this macro.
   * NOTE(review): operand %3 is declared as an input but is modified.
   */
  126. #define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
  127. __asm__ volatile(\
  128. "lea " offset "(%0), %%"REG_d" \n\t"\
  129. "movq "DITHER32"+0(%0), %%mm4 \n\t"\
  130. "movq "DITHER32"+8(%0), %%mm5 \n\t"\
  131. "movq "DITHER32"+16(%0), %%mm6 \n\t"\
  132. "movq "DITHER32"+24(%0), %%mm7 \n\t"\
  133. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  134. ".p2align 4 \n\t"\
  135. "1: \n\t"\
  136. "movq (%%"REG_S", %3, 2), %%mm0 \n\t" /* srcData */\
  137. "movq 8(%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
  138. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  139. "movq (%%"REG_S", %3, 2), %%mm1 \n\t" /* srcData */\
  140. "movq %%mm0, %%mm3 \n\t"\
  141. "punpcklwd %%mm1, %%mm0 \n\t"\
  142. "punpckhwd %%mm1, %%mm3 \n\t"\
  143. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
  144. "pmaddwd %%mm1, %%mm0 \n\t"\
  145. "pmaddwd %%mm1, %%mm3 \n\t"\
  146. "paddd %%mm0, %%mm4 \n\t"\
  147. "paddd %%mm3, %%mm5 \n\t"\
  148. "movq 8(%%"REG_S", %3, 2), %%mm3 \n\t" /* srcData */\
  149. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  150. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  151. "test %%"REG_S", %%"REG_S" \n\t"\
  152. "movq %%mm2, %%mm0 \n\t"\
  153. "punpcklwd %%mm3, %%mm2 \n\t"\
  154. "punpckhwd %%mm3, %%mm0 \n\t"\
  155. "pmaddwd %%mm1, %%mm2 \n\t"\
  156. "pmaddwd %%mm1, %%mm0 \n\t"\
  157. "paddd %%mm2, %%mm6 \n\t"\
  158. "paddd %%mm0, %%mm7 \n\t"\
  159. " jnz 1b \n\t"\
  160. "psrad $16, %%mm4 \n\t"\
  161. "psrad $16, %%mm5 \n\t"\
  162. "psrad $16, %%mm6 \n\t"\
  163. "psrad $16, %%mm7 \n\t"\
  164. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  165. "packssdw %%mm5, %%mm4 \n\t"\
  166. "packssdw %%mm7, %%mm6 \n\t"\
  167. "paddw %%mm0, %%mm4 \n\t"\
  168. "paddw %%mm0, %%mm6 \n\t"\
  169. "psraw $3, %%mm4 \n\t"\
  170. "psraw $3, %%mm6 \n\t"\
  171. "packuswb %%mm6, %%mm4 \n\t"\
  172. MOVNTQ(%%mm4, (%1, %3))\
  173. "add $8, %3 \n\t"\
  174. "cmp %2, %3 \n\t"\
  175. "lea " offset "(%0), %%"REG_d" \n\t"\
  176. "movq "DITHER32"+0(%0), %%mm4 \n\t"\
  177. "movq "DITHER32"+8(%0), %%mm5 \n\t"\
  178. "movq "DITHER32"+16(%0), %%mm6 \n\t"\
  179. "movq "DITHER32"+24(%0), %%mm7 \n\t"\
  180. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  181. "jb 1b \n\t"\
  182. :: "r" (&c->redDither),\
  183. "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
  184. : "%"REG_a, "%"REG_d, "%"REG_S\
  185. );
  /*
   * dither_8to32(): expand 8 dither bytes to 8 dwords and store them in the
   * context for the 32-bit ("accurate") vertical scaler.
   *
   * Reads one quadword from srcDither. When `rot` is non-zero the quadword
   * is first rotated right by 24 bits (psrlq 24 combined with psllq 40).
   * The bytes are zero-extended to words and then to dwords (unpack against
   * a zeroed mm0), each dword is shifted left by 12, and the four resulting
   * quadwords are written to DITHER32+0, +8, +16 and +24 relative to
   * &c->redDither.
   *
   * Only compiled when COMPILE_TEMPLATE_MMX2 is 0; the name is not wrapped
   * in RENAME().
   *
   * NOTE(review): the asm writes memory through %1 and uses mm0 and
   * mm4..mm7 but declares neither a "memory" clobber nor register clobbers.
   */
  186. #if !COMPILE_TEMPLATE_MMX2
  187. static av_always_inline void
  188. dither_8to32(SwsContext *c, const uint8_t *srcDither, int rot)
  189. {
  190. if (rot) {
  /* rotated variant: rotate the 8 dither bytes by three positions first */
  191. __asm__ volatile("pxor %%mm0, %%mm0\n\t"
  192. "movq (%0), %%mm4\n\t"
  193. "movq %%mm4, %%mm5\n\t"
  194. "psrlq $24, %%mm4\n\t"
  195. "psllq $40, %%mm5\n\t"
  196. "por %%mm5, %%mm4\n\t"
  197. "movq %%mm4, %%mm6\n\t"
  198. "punpcklbw %%mm0, %%mm4\n\t"
  199. "punpckhbw %%mm0, %%mm6\n\t"
  200. "movq %%mm4, %%mm5\n\t"
  201. "movq %%mm6, %%mm7\n\t"
  202. "punpcklwd %%mm0, %%mm4\n\t"
  203. "punpckhwd %%mm0, %%mm5\n\t"
  204. "punpcklwd %%mm0, %%mm6\n\t"
  205. "punpckhwd %%mm0, %%mm7\n\t"
  206. "pslld $12, %%mm4\n\t"
  207. "pslld $12, %%mm5\n\t"
  208. "pslld $12, %%mm6\n\t"
  209. "pslld $12, %%mm7\n\t"
  210. "movq %%mm4, "DITHER32"+0(%1)\n\t"
  211. "movq %%mm5, "DITHER32"+8(%1)\n\t"
  212. "movq %%mm6, "DITHER32"+16(%1)\n\t"
  213. "movq %%mm7, "DITHER32"+24(%1)\n\t"
  214. :: "r"(srcDither), "r"(&c->redDither)
  215. );
  216. } else {
  /* unrotated variant: same expansion without the byte rotation */
  217. __asm__ volatile("pxor %%mm0, %%mm0\n\t"
  218. "movq (%0), %%mm4\n\t"
  219. "movq %%mm4, %%mm6\n\t"
  220. "punpcklbw %%mm0, %%mm4\n\t"
  221. "punpckhbw %%mm0, %%mm6\n\t"
  222. "movq %%mm4, %%mm5\n\t"
  223. "movq %%mm6, %%mm7\n\t"
  224. "punpcklwd %%mm0, %%mm4\n\t"
  225. "punpckhwd %%mm0, %%mm5\n\t"
  226. "punpcklwd %%mm0, %%mm6\n\t"
  227. "punpckhwd %%mm0, %%mm7\n\t"
  228. "pslld $12, %%mm4\n\t"
  229. "pslld $12, %%mm5\n\t"
  230. "pslld $12, %%mm6\n\t"
  231. "pslld $12, %%mm7\n\t"
  232. "movq %%mm4, "DITHER32"+0(%1)\n\t"
  233. "movq %%mm5, "DITHER32"+8(%1)\n\t"
  234. "movq %%mm6, "DITHER32"+16(%1)\n\t"
  235. "movq %%mm7, "DITHER32"+24(%1)\n\t"
  236. :: "r"(srcDither), "r"(&c->redDither)
  237. );
  238. }
  239. }
  240. #endif
  /**
   * Vertical multi-tap filter writing planar 8-bit output, using the 32-bit
   * accumulator macro YSCALEYUV2YV12X_ACCURATE.
   *
   * Structure mirrors yuv2yuvX: the filter/source parameters (lumFilter,
   * lumSrc, lumFilterSize, chrFilter, chrUSrc, chrVSrc, chrFilterSize,
   * alpSrc) are not referenced in this body; the asm reads its filter lists
   * from the context relative to &c->redDither.
   *
   * @param c        scaler context; supplies lumDither8, chrDither8,
   *                 uv_off_byte and the data addressed from &c->redDither
   * @param dest     output planes: [0] luma, [1] U, [2] V, [3] alpha
   *                 (alpha is only considered when CONFIG_SWSCALE_ALPHA)
   * @param dstW     end index passed to the luma and alpha passes
   * @param chrDstW  end index passed to the chroma passes
   *
   * Order matters: each dither_8to32() call stores the dither dwords that
   * the following YSCALEYUV2YV12X_ACCURATE statement loads again.
   */
  241. static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
  242. const int16_t **lumSrc, int lumFilterSize,
  243. const int16_t *chrFilter, const int16_t **chrUSrc,
  244. const int16_t **chrVSrc,
  245. int chrFilterSize, const int16_t **alpSrc,
  246. uint8_t *dest[4], int dstW, int chrDstW)
  247. {
  248. uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
  249. *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
  250. const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
  /* chroma is skipped entirely when there is no U plane */
  251. if (uDest) {
  /* uv_off is in int16 samples (the asm scales the index by 2) */
  252. x86_reg uv_off = c->uv_off_byte >> 1;
  253. dither_8to32(c, chrDither, 0);
  254. YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
  /* V uses the rotated dither; the index starts at uv_off, so the
   * destination is biased by -uv_off to make the first store hit vDest */
  255. dither_8to32(c, chrDither, 1);
  256. YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
  257. }
  /* the luma dither is shared by the alpha and luma passes */
  258. dither_8to32(c, lumDither, 0);
  259. if (CONFIG_SWSCALE_ALPHA && aDest) {
  260. YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
  261. }
  262. YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, yDest, dstW, 0)
  263. }
  /**
   * Unfiltered (single source line) conversion of 16-bit intermediate
   * samples to planar 8-bit output, without dithering.
   *
   * Planes are handled in the order alpha, V, U, luma (p = 3..0); a plane is
   * skipped when dst[p] is NULL. For each plane the asm runs a negative
   * index in REG_a from -counter[p] upwards in steps of 8: it loads 8 int16
   * samples relative to the END of the source line, shifts them right by 7,
   * packs them to unsigned bytes and stores them with MOVNTQ relative to the
   * END of the destination line. The loop stops when "add $8" produces a
   * carry, i.e. when the index reaches or passes zero, so work is always
   * done in whole groups of 8 pixels.
   *
   * @param c        not referenced in this body
   * @param lumSrc   luma source line (dstW samples)
   * @param chrUSrc  U source line (chrDstW samples)
   * @param chrVSrc  V source line (chrDstW samples)
   * @param alpSrc   alpha source line (dstW samples)
   * @param dst      output planes: [0] luma, [1] U, [2] V, [3] alpha
   * @param dstW     luma/alpha width
   * @param chrDstW  chroma width
   *
   * NOTE(review): src[] offsets all four source pointers before dst[p] is
   * checked; if a caller passes NULL for an unused plane this is pointer
   * arithmetic on a null pointer -- confirm against the callers.
   */
  264. static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
  265. const int16_t *chrUSrc, const int16_t *chrVSrc,
  266. const int16_t *alpSrc,
  267. uint8_t *dst[4], int dstW, int chrDstW)
  268. {
  269. int p= 4;
  /* end-of-line pointers; the asm indexes them with a negative offset */
  270. const int16_t *src[4]= {
  271. lumSrc + dstW, chrUSrc + chrDstW,
  272. chrVSrc + chrDstW, alpSrc + dstW
  273. };
  274. x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
  275. while (p--) {
  276. if (dst[p]) {
  277. __asm__ volatile(
  278. "mov %2, %%"REG_a" \n\t"
  279. ".p2align 4 \n\t" /* FIXME Unroll? */
  280. "1: \n\t"
  281. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
  282. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
  283. "psraw $7, %%mm0 \n\t"
  284. "psraw $7, %%mm1 \n\t"
  285. "packuswb %%mm1, %%mm0 \n\t"
  286. MOVNTQ(%%mm0, (%1, %%REGa))
  287. "add $8, %%"REG_a" \n\t"
  288. "jnc 1b \n\t"
  289. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  290. "g" (-counter[p])
  291. : "%"REG_a
  292. );
  293. }
  294. }
  295. }
  /**
   * Unfiltered (single source line) conversion of 16-bit intermediate
   * samples to planar 8-bit output, with dithering.
   *
   * Planes are handled in the order alpha, V, U, luma (p = 3..0); a plane is
   * skipped when dst[p] is NULL. Before each plane dither_8to16() stores the
   * dither words that the asm then loads into mm6/mm7. The asm runs a
   * negative index in REG_a from -counter[p] upwards in steps of 8: it loads
   * 8 int16 samples relative to the END of the source line, adds the dither
   * with signed saturation, shifts right by 7, packs to unsigned bytes and
   * stores with MOVNTQ relative to the END of the destination line. The loop
   * stops when "add $8" produces a carry, so work is always done in whole
   * groups of 8 pixels.
   *
   * Dither selection follows the plane layout {luma, U, V, alpha} used by
   * src[] and counter[]: the chroma planes (p == 1, p == 2) use chrDither,
   * with the rotated table for V (p == 2); luma and alpha use lumDither.
   * This matches what yuv2yuvX / yuv2yuvX_ar do. The previous test
   * (p == 2 || p == 3) gave U the luma dither and alpha the chroma dither.
   *
   * @param c        scaler context; supplies lumDither8, chrDither8 and the
   *                 DITHER16 storage addressed from &c->redDither
   * @param lumSrc   luma source line (dstW samples)
   * @param chrUSrc  U source line (chrDstW samples)
   * @param chrVSrc  V source line (chrDstW samples)
   * @param alpSrc   alpha source line (dstW samples)
   * @param dst      output planes: [0] luma, [1] U, [2] V, [3] alpha
   * @param dstW     luma/alpha width
   * @param chrDstW  chroma width
   *
   * NOTE(review): src[] offsets all four source pointers before dst[p] is
   * checked; if a caller passes NULL for an unused plane this is pointer
   * arithmetic on a null pointer -- confirm against the callers.
   */
  296. static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
  297. const int16_t *chrUSrc, const int16_t *chrVSrc,
  298. const int16_t *alpSrc,
  299. uint8_t *dst[4], int dstW, int chrDstW)
  300. {
  301. int p= 4;
  /* end-of-line pointers; the asm indexes them with a negative offset */
  302. const int16_t *src[4]= {
  303. lumSrc + dstW, chrUSrc + chrDstW,
  304. chrVSrc + chrDstW, alpSrc + dstW
  305. };
  306. x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
  307. const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
  308. while (p--) {
  309. if (dst[p]) {
  /* planes 1 (U) and 2 (V) are chroma; only V gets the rotated table */
  310. dither_8to16(c, (p == 1 || p == 2) ? chrDither : lumDither, p == 2);
  311. __asm__ volatile(
  312. "mov %2, %%"REG_a" \n\t"
  313. "movq "DITHER16"+0(%3), %%mm6 \n\t"
  314. "movq "DITHER16"+8(%3), %%mm7 \n\t"
  315. ".p2align 4 \n\t" /* FIXME Unroll? */
  316. "1: \n\t"
  317. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
  318. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
  319. "paddsw %%mm6, %%mm0 \n\t"
  320. "paddsw %%mm7, %%mm1 \n\t"
  321. "psraw $7, %%mm0 \n\t"
  322. "psraw $7, %%mm1 \n\t"
  323. "packuswb %%mm1, %%mm0 \n\t"
  324. MOVNTQ(%%mm0, (%1, %%REGa))
  325. "add $8, %%"REG_a" \n\t"
  326. "jnc 1b \n\t"
  327. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  328. "g" (-counter[p]), "r"(&c->redDither)
  329. : "%"REG_a
  330. );
  331. }
  332. }
  333. }
  /*
   * YSCALEYUV2PACKEDX_UV
   *
   * Opens an `__asm__ volatile(` statement (closed later by
   * YSCALEYUV2PACKEDX_END or an explicit operand list) and emits the chroma
   * part of the 16-bit vertical filter for packed output.
   *
   * REG_a is zeroed and used as the running index; label 1 is the start of
   * the per-group loop that the WRITE* macros jump back to. mm3 (U) and
   * mm4 (V) start from the words at VROUNDER_OFFSET(%0). The list at
   * CHR_MMX_FILTER_OFFSET(%0) is walked in 16-byte steps (pointer at +0,
   * coefficient at +8, zero pointer terminates): per entry the U samples at
   * pointer + REG_a and the V samples at pointer + %6 + REG_a are multiplied
   * with the coefficient (pmulhw) and accumulated.
   *
   * NOTE(review): the last line ends with a line continuation, so the
   * definition relies on the line that follows being empty -- confirm
   * against the unflattened source.
   */
  334. #define YSCALEYUV2PACKEDX_UV \
  335. __asm__ volatile(\
  336. "xor %%"REG_a", %%"REG_a" \n\t"\
  337. ".p2align 4 \n\t"\
  338. "nop \n\t"\
  339. "1: \n\t"\
  340. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  341. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  342. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  343. "movq %%mm3, %%mm4 \n\t"\
  344. ".p2align 4 \n\t"\
  345. "2: \n\t"\
  346. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  347. "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
  348. "add %6, %%"REG_S" \n\t" \
  349. "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
  350. "add $16, %%"REG_d" \n\t"\
  351. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  352. "pmulhw %%mm0, %%mm2 \n\t"\
  353. "pmulhw %%mm0, %%mm5 \n\t"\
  354. "paddw %%mm2, %%mm3 \n\t"\
  355. "paddw %%mm5, %%mm4 \n\t"\
  356. "test %%"REG_S", %%"REG_S" \n\t"\
  357. " jnz 2b \n\t"\
  /*
   * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
   *
   * Asm fragment (no statement opener/closer) that applies the 16-bit
   * vertical filter whose list starts at offset(%0) to two groups of four
   * samples; used for luma and for alpha. All register roles are macro
   * arguments:
   *   coeff      register receiving the coefficient of each entry
   *   src1/src2  scratch registers for the samples at pointer + 2*REG_a and
   *              pointer + 2*REG_a + 8
   *   dst1/dst2  accumulators, initialised from VROUNDER_OFFSET(%0)
   * The list layout is the same as in the chroma pass: 16-byte entries with
   * pointer at +0 and coefficient at +8, ended by a zero pointer.
   *
   * NOTE(review): the last line ends with a line continuation, so the
   * definition relies on the line that follows being empty -- confirm
   * against the unflattened source.
   */
  358. #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
  359. "lea "offset"(%0), %%"REG_d" \n\t"\
  360. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  361. "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
  362. "movq "#dst1", "#dst2" \n\t"\
  363. ".p2align 4 \n\t"\
  364. "2: \n\t"\
  365. "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
  366. "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
  367. "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
  368. "add $16, %%"REG_d" \n\t"\
  369. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  370. "pmulhw "#coeff", "#src1" \n\t"\
  371. "pmulhw "#coeff", "#src2" \n\t"\
  372. "paddw "#src1", "#dst1" \n\t"\
  373. "paddw "#src2", "#dst2" \n\t"\
  374. "test %%"REG_S", %%"REG_S" \n\t"\
  375. " jnz 2b \n\t"\
  /*
   * YSCALEYUV2PACKEDX: chroma pass followed by the luma pass of the 16-bit
   * vertical filter. After it, mm3/mm4 hold the U/V sums and mm1/mm7 the two
   * luma sums (mm0, mm2 and mm5 are used as scratch by the luma pass).
   * The asm statement opened by YSCALEYUV2PACKEDX_UV is still open.
   *
   * NOTE(review): the last line ends with a line continuation, so the
   * definition relies on the line that follows being empty -- confirm
   * against the unflattened source.
   */
  376. #define YSCALEYUV2PACKEDX \
  377. YSCALEYUV2PACKEDX_UV \
  378. YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
  /*
   * YSCALEYUV2PACKEDX_END: operand and clobber lists that close the asm
   * statement opened by YSCALEYUV2PACKEDX_UV / YSCALEYUV2PACKEDX_ACCURATE_UV.
   *
   * Input operands (there are no outputs):
   *   %0 = &c->redDither          %4 = dest
   *   %1..%3 = dummy (memory)     %5 = dstW_reg (memory)
   *                               %6 = uv_off (memory)
   * Clobbers REG_a, REG_d and REG_S.
   *
   * The enclosing function must therefore declare variables named c, dummy,
   * dest, dstW_reg and uv_off.
   */
  379. #define YSCALEYUV2PACKEDX_END \
  380. :: "r" (&c->redDither), \
  381. "m" (dummy), "m" (dummy), "m" (dummy),\
  382. "r" (dest), "m" (dstW_reg), "m"(uv_off) \
  383. : "%"REG_a, "%"REG_d, "%"REG_S \
  384. );
  /*
   * YSCALEYUV2PACKEDX_ACCURATE_UV
   *
   * Opens an `__asm__ volatile(` statement and emits the chroma part of the
   * 32-bit ("accurate") vertical filter for packed output.
   *
   * REG_a is zeroed and used as the running index; label 1 is the start of
   * the per-group loop that the WRITE* macros jump back to. mm4..mm7 are
   * cleared and used as 32-bit accumulators (mm4/mm5 for U, mm6/mm7 for V).
   * The list at CHR_MMX_FILTER_OFFSET(%0) is walked in APCK_SIZE steps: each
   * step takes a first source pointer at +0 and a second at APCK_PTR2, with
   * the V samples located %6 bytes after the U samples of the same pointer;
   * samples of the two sources are interleaved and combined with the
   * coefficient quadword at APCK_COEF using pmaddwd. A zero pointer at the
   * next step ends the walk.
   * Finally the sums are shifted right by 16, packed to words, the words at
   * VROUNDER_OFFSET(%0) are added, and the results are saved to U_TEMP(%0)
   * and V_TEMP(%0) so the registers can be reused by the luma pass.
   *
   * NOTE(review): the last line ends with a line continuation, so the
   * definition relies on the line that follows being empty -- confirm
   * against the unflattened source.
   */
  385. #define YSCALEYUV2PACKEDX_ACCURATE_UV \
  386. __asm__ volatile(\
  387. "xor %%"REG_a", %%"REG_a" \n\t"\
  388. ".p2align 4 \n\t"\
  389. "nop \n\t"\
  390. "1: \n\t"\
  391. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  392. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  393. "pxor %%mm4, %%mm4 \n\t"\
  394. "pxor %%mm5, %%mm5 \n\t"\
  395. "pxor %%mm6, %%mm6 \n\t"\
  396. "pxor %%mm7, %%mm7 \n\t"\
  397. ".p2align 4 \n\t"\
  398. "2: \n\t"\
  399. "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
  400. "add %6, %%"REG_S" \n\t" \
  401. "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
  402. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  403. "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
  404. "movq %%mm0, %%mm3 \n\t"\
  405. "punpcklwd %%mm1, %%mm0 \n\t"\
  406. "punpckhwd %%mm1, %%mm3 \n\t"\
  407. "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
  408. "pmaddwd %%mm1, %%mm0 \n\t"\
  409. "pmaddwd %%mm1, %%mm3 \n\t"\
  410. "paddd %%mm0, %%mm4 \n\t"\
  411. "paddd %%mm3, %%mm5 \n\t"\
  412. "add %6, %%"REG_S" \n\t" \
  413. "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
  414. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  415. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  416. "test %%"REG_S", %%"REG_S" \n\t"\
  417. "movq %%mm2, %%mm0 \n\t"\
  418. "punpcklwd %%mm3, %%mm2 \n\t"\
  419. "punpckhwd %%mm3, %%mm0 \n\t"\
  420. "pmaddwd %%mm1, %%mm2 \n\t"\
  421. "pmaddwd %%mm1, %%mm0 \n\t"\
  422. "paddd %%mm2, %%mm6 \n\t"\
  423. "paddd %%mm0, %%mm7 \n\t"\
  424. " jnz 2b \n\t"\
  425. "psrad $16, %%mm4 \n\t"\
  426. "psrad $16, %%mm5 \n\t"\
  427. "psrad $16, %%mm6 \n\t"\
  428. "psrad $16, %%mm7 \n\t"\
  429. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  430. "packssdw %%mm5, %%mm4 \n\t"\
  431. "packssdw %%mm7, %%mm6 \n\t"\
  432. "paddw %%mm0, %%mm4 \n\t"\
  433. "paddw %%mm0, %%mm6 \n\t"\
  434. "movq %%mm4, "U_TEMP"(%0) \n\t"\
  435. "movq %%mm6, "V_TEMP"(%0) \n\t"\
  /*
   * YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
   *
   * Asm fragment (no statement opener/closer) applying the 32-bit
   * ("accurate") vertical filter whose list starts at offset(%0) to two
   * groups of four samples; used for luma and for alpha.
   *
   * mm1/mm5 accumulate the first group and mm7/mm6 the second, starting from
   * zero. The list is walked in APCK_SIZE steps with source pointers at +0
   * and APCK_PTR2 and the coefficient quadword at APCK_COEF; samples of the
   * two sources are interleaved and combined with pmaddwd. A zero pointer at
   * the next step ends the walk.
   * Afterwards the sums are shifted right by 16 and packed to words in mm1
   * and mm7, the words at VROUNDER_OFFSET(%0) are added, and mm3/mm4 are
   * reloaded from U_TEMP(%0)/V_TEMP(%0).
   * Registers mm0, mm2, mm3 and mm4 are used as scratch inside the loop.
   *
   * NOTE(review): the last line ends with a line continuation, so the
   * definition relies on the line that follows being empty -- confirm
   * against the unflattened source.
   */
  436. #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
  437. "lea "offset"(%0), %%"REG_d" \n\t"\
  438. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  439. "pxor %%mm1, %%mm1 \n\t"\
  440. "pxor %%mm5, %%mm5 \n\t"\
  441. "pxor %%mm7, %%mm7 \n\t"\
  442. "pxor %%mm6, %%mm6 \n\t"\
  443. ".p2align 4 \n\t"\
  444. "2: \n\t"\
  445. "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
  446. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
  447. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  448. "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
  449. "movq %%mm0, %%mm3 \n\t"\
  450. "punpcklwd %%mm4, %%mm0 \n\t"\
  451. "punpckhwd %%mm4, %%mm3 \n\t"\
  452. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
  453. "pmaddwd %%mm4, %%mm0 \n\t"\
  454. "pmaddwd %%mm4, %%mm3 \n\t"\
  455. "paddd %%mm0, %%mm1 \n\t"\
  456. "paddd %%mm3, %%mm5 \n\t"\
  457. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
  458. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  459. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  460. "test %%"REG_S", %%"REG_S" \n\t"\
  461. "movq %%mm2, %%mm0 \n\t"\
  462. "punpcklwd %%mm3, %%mm2 \n\t"\
  463. "punpckhwd %%mm3, %%mm0 \n\t"\
  464. "pmaddwd %%mm4, %%mm2 \n\t"\
  465. "pmaddwd %%mm4, %%mm0 \n\t"\
  466. "paddd %%mm2, %%mm7 \n\t"\
  467. "paddd %%mm0, %%mm6 \n\t"\
  468. " jnz 2b \n\t"\
  469. "psrad $16, %%mm1 \n\t"\
  470. "psrad $16, %%mm5 \n\t"\
  471. "psrad $16, %%mm7 \n\t"\
  472. "psrad $16, %%mm6 \n\t"\
  473. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  474. "packssdw %%mm5, %%mm1 \n\t"\
  475. "packssdw %%mm6, %%mm7 \n\t"\
  476. "paddw %%mm0, %%mm1 \n\t"\
  477. "paddw %%mm0, %%mm7 \n\t"\
  478. "movq "U_TEMP"(%0), %%mm3 \n\t"\
  479. "movq "V_TEMP"(%0), %%mm4 \n\t"\
  /*
   * YSCALEYUV2PACKEDX_ACCURATE: chroma pass followed by the luma pass of the
   * 32-bit vertical filter. After it, mm3/mm4 hold U/V (reloaded from
   * U_TEMP/V_TEMP) and mm1/mm7 the two luma groups, which is the register
   * layout YSCALEYUV2RGBX reads. The asm statement is still open.
   */
  480. #define YSCALEYUV2PACKEDX_ACCURATE \
  481. YSCALEYUV2PACKEDX_ACCURATE_UV \
  482. YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
  /*
   * YSCALEYUV2RGBX
   *
   * Asm fragment converting the filtered samples of one 8-pixel group from
   * YUV to RGB.
   *
   * In:  mm1, mm7 = two groups of four luma words; mm3 = four U words,
   *      mm4 = four V words.
   * The offsets at U_OFFSET/V_OFFSET/Y_OFFSET(%0) are subtracted and the
   * values are multiplied (pmulhw) with the coefficient words at UG_COEFF,
   * VG_COEFF, UB_COEFF, VR_COEFF and Y_COEFF(%0). Each chroma term is
   * duplicated (punpcklwd/punpckhwd of a register with itself) so that one
   * chroma sample is applied to two neighbouring luma samples.
   * Out: after the final packuswb, mm2 / mm4 / mm5 hold eight unsigned bytes
   *      each, which the WRITE* macros consume as B / G / R.
   * mm0, mm3 and mm6 are used as scratch.
   *
   * NOTE(review): the last line ends with a line continuation, so the
   * definition relies on the line that follows being empty -- confirm
   * against the unflattened source.
   */
  483. #define YSCALEYUV2RGBX \
  484. "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
  485. "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
  486. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  487. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  488. "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
  489. "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
  490. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  491. "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
  492. "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
  493. "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
  494. "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
  495. "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
  496. "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
  497. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  498. "paddw %%mm3, %%mm4 \n\t"\
  499. "movq %%mm2, %%mm0 \n\t"\
  500. "movq %%mm5, %%mm6 \n\t"\
  501. "movq %%mm4, %%mm3 \n\t"\
  502. "punpcklwd %%mm2, %%mm2 \n\t"\
  503. "punpcklwd %%mm5, %%mm5 \n\t"\
  504. "punpcklwd %%mm4, %%mm4 \n\t"\
  505. "paddw %%mm1, %%mm2 \n\t"\
  506. "paddw %%mm1, %%mm5 \n\t"\
  507. "paddw %%mm1, %%mm4 \n\t"\
  508. "punpckhwd %%mm0, %%mm0 \n\t"\
  509. "punpckhwd %%mm6, %%mm6 \n\t"\
  510. "punpckhwd %%mm3, %%mm3 \n\t"\
  511. "paddw %%mm7, %%mm0 \n\t"\
  512. "paddw %%mm7, %%mm6 \n\t"\
  513. "paddw %%mm7, %%mm3 \n\t"\
  514. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  515. "packuswb %%mm0, %%mm2 \n\t"\
  516. "packuswb %%mm6, %%mm5 \n\t"\
  517. "packuswb %%mm3, %%mm4 \n\t"\
  /*
   * REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
   *
   * Asm fragment that interleaves four registers of eight bytes each into
   * eight 4-byte pixels and stores them.
   *   dst, dstw, index  destination base, width operand and running index
   *   b, g, r, a        input registers; in memory each pixel is laid out
   *                     as the byte from b, then g, then r, then a
   *   q0, q2, q3, t     scratch registers
   * The registers b and r are overwritten as well. The 32 output bytes are
   * stored with four MOVNTQ at dst + index*4; index is then advanced by 8
   * and control jumps back to label 1 while index < dstw (unsigned, `jb`).
   *
   * WRITEBGR32 is the wrapper to use: it forwards to REAL_WRITEBGR32 so
   * that the arguments are macro-expanded before being stringified.
   */
  518. #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
  519. "movq "#b", "#q2" \n\t" /* B */\
  520. "movq "#r", "#t" \n\t" /* R */\
  521. "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
  522. "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
  523. "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
  524. "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
  525. "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
  526. "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
  527. "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
  528. "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
  529. "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
  530. "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
  531. \
  532. MOVNTQ( q0, (dst, index, 4))\
  533. MOVNTQ( b, 8(dst, index, 4))\
  534. MOVNTQ( q2, 16(dst, index, 4))\
  535. MOVNTQ( q3, 24(dst, index, 4))\
  536. \
  537. "add $8, "#index" \n\t"\
  538. "cmp "#dstw", "#index" \n\t"\
  539. " jb 1b \n\t"
  540. #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
  /**
   * Vertical multi-tap filter with 32-bit accumulation, writing 4-byte
   * packed pixels through WRITEBGR32.
   *
   * The filter/source parameters and dstY are not referenced in this body;
   * the asm reads its filter lists from the context relative to
   * &c->redDither. The locals dummy, dstW_reg and uv_off exist because
   * YSCALEYUV2PACKEDX_END names them as asm operands.
   *
   * @param c     scaler context (alpPixBuf selects the alpha path,
   *              uv_off_byte is the byte distance from U to V samples)
   * @param dest  output line (operand %4)
   * @param dstW  number of pixels (operand %5, via dstW_reg)
   *
   * Alpha path (CONFIG_SWSCALE_ALPHA and c->alpPixBuf set): the B/G/R bytes
   * produced by YSCALEYUV2RGBX are parked in U_TEMP/V_TEMP/Y_TEMP while the
   * alpha filter runs, because that pass uses the registers holding them.
   * YSCALEYUV2PACKEDX_ACCURATE_YA reloads U_TEMP/V_TEMP into mm3/mm4, so the
   * writer takes B from mm3, G from mm4, R from mm5 (reloaded from Y_TEMP)
   * and alpha from mm1.
   * Otherwise mm7 is set to all ones (pcmpeqd) and used as the fourth byte.
   */
  541. static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
  542. const int16_t **lumSrc, int lumFilterSize,
  543. const int16_t *chrFilter, const int16_t **chrUSrc,
  544. const int16_t **chrVSrc,
  545. int chrFilterSize, const int16_t **alpSrc,
  546. uint8_t *dest, int dstW, int dstY)
  547. {
  548. x86_reg dummy=0;
  549. x86_reg dstW_reg = dstW;
  550. x86_reg uv_off = c->uv_off_byte;
  551. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  552. YSCALEYUV2PACKEDX_ACCURATE
  553. YSCALEYUV2RGBX
  554. "movq %%mm2, "U_TEMP"(%0) \n\t"
  555. "movq %%mm4, "V_TEMP"(%0) \n\t"
  556. "movq %%mm5, "Y_TEMP"(%0) \n\t"
  557. YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
  558. "movq "Y_TEMP"(%0), %%mm5 \n\t"
  559. "psraw $3, %%mm1 \n\t"
  560. "psraw $3, %%mm7 \n\t"
  561. "packuswb %%mm7, %%mm1 \n\t"
  562. WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
  563. YSCALEYUV2PACKEDX_END
  564. } else {
  565. YSCALEYUV2PACKEDX_ACCURATE
  566. YSCALEYUV2RGBX
  567. "pcmpeqd %%mm7, %%mm7 \n\t"
  568. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  569. YSCALEYUV2PACKEDX_END
  570. }
  571. }
  /**
   * Vertical multi-tap filter with 16-bit accumulation, writing 4-byte
   * packed pixels through WRITEBGR32.
   *
   * The filter/source parameters and dstY are not referenced in this body;
   * the asm reads its filter lists from the context relative to
   * &c->redDither. The locals dummy, dstW_reg and uv_off exist because
   * YSCALEYUV2PACKEDX_END names them as asm operands.
   *
   * @param c     scaler context (alpPixBuf selects the alpha path,
   *              uv_off_byte is the byte distance from U to V samples)
   * @param dest  output line (operand %4)
   * @param dstW  number of pixels (operand %5, via dstW_reg)
   *
   * Alpha path: the alpha filter is run with mm0/mm3/mm6 as scratch and
   * mm1/mm7 as accumulators, so the B/G/R bytes in mm2/mm4/mm5 survive; the
   * alpha sums are shifted right by 3 and packed into mm1.
   * Otherwise mm7 is set to all ones (pcmpeqd) and used as the fourth byte.
   */
  572. static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
  573. const int16_t **lumSrc, int lumFilterSize,
  574. const int16_t *chrFilter, const int16_t **chrUSrc,
  575. const int16_t **chrVSrc,
  576. int chrFilterSize, const int16_t **alpSrc,
  577. uint8_t *dest, int dstW, int dstY)
  578. {
  579. x86_reg dummy=0;
  580. x86_reg dstW_reg = dstW;
  581. x86_reg uv_off = c->uv_off_byte;
  582. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  583. YSCALEYUV2PACKEDX
  584. YSCALEYUV2RGBX
  585. YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
  586. "psraw $3, %%mm1 \n\t"
  587. "psraw $3, %%mm7 \n\t"
  588. "packuswb %%mm7, %%mm1 \n\t"
  589. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  590. YSCALEYUV2PACKEDX_END
  591. } else {
  592. YSCALEYUV2PACKEDX
  593. YSCALEYUV2RGBX
  594. "pcmpeqd %%mm7, %%mm7 \n\t"
  595. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  596. YSCALEYUV2PACKEDX_END
  597. }
  598. }
  /*
   * REAL_WRITERGB16(dst, dstw, index)
   *
   * Asm fragment that packs eight pixels into 16 bits each and stores them.
   * In: mm2 = B, mm4 = G, mm5 = R (eight bytes each), mm7 = 0.
   * B and R are masked with bF8 and G with bFC. In each resulting 16-bit
   * pixel B>>3 occupies bits 0-4, the six kept G bits occupy bits 5-10
   * (G is widened against mm7 and shifted left by 3) and the five kept R
   * bits occupy bits 11-15 (R becomes the high byte of the word).
   * The 16 output bytes are stored with two MOVNTQ at dst + index*2; index
   * is then advanced by 8 and control jumps back to label 1 while
   * index < dstw (unsigned, `jb`). mm1 and mm3 are used as scratch.
   *
   * WRITERGB16 forwards to REAL_WRITERGB16 so that the arguments are
   * macro-expanded before being stringified.
   */
  599. #define REAL_WRITERGB16(dst, dstw, index) \
  600. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  601. "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
  602. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  603. "psrlq $3, %%mm2 \n\t"\
  604. \
  605. "movq %%mm2, %%mm1 \n\t"\
  606. "movq %%mm4, %%mm3 \n\t"\
  607. \
  608. "punpcklbw %%mm7, %%mm3 \n\t"\
  609. "punpcklbw %%mm5, %%mm2 \n\t"\
  610. "punpckhbw %%mm7, %%mm4 \n\t"\
  611. "punpckhbw %%mm5, %%mm1 \n\t"\
  612. \
  613. "psllq $3, %%mm3 \n\t"\
  614. "psllq $3, %%mm4 \n\t"\
  615. \
  616. "por %%mm3, %%mm2 \n\t"\
  617. "por %%mm4, %%mm1 \n\t"\
  618. \
  619. MOVNTQ(%%mm2, (dst, index, 2))\
  620. MOVNTQ(%%mm1, 8(dst, index, 2))\
  621. \
  622. "add $8, "#index" \n\t"\
  623. "cmp "#dstw", "#index" \n\t"\
  624. " jb 1b \n\t"
  625. #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
  /**
   * Vertical multi-tap filter with 32-bit accumulation, writing 16-bit
   * pixels through WRITERGB16.
   *
   * The filter/source parameters and dstY are not referenced in this body;
   * the asm reads its filter lists from the context relative to
   * &c->redDither. The locals dummy, dstW_reg and uv_off exist because
   * YSCALEYUV2PACKEDX_END names them as asm operands.
   *
   * @param c     scaler context (uv_off_byte is the byte distance from U to
   *              V samples)
   * @param dest  output line (operand %4)
   * @param dstW  number of pixels (operand %5, via dstW_reg)
   *
   * mm7 is cleared because WRITERGB16 unpacks G against it. When
   * DITHER1XBPP is defined, the bytes at BLUE_DITHER / GREEN_DITHER /
   * RED_DITHER(%0) are added to B / G / R with unsigned saturation first.
   */
  626. static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
  627. const int16_t **lumSrc, int lumFilterSize,
  628. const int16_t *chrFilter, const int16_t **chrUSrc,
  629. const int16_t **chrVSrc,
  630. int chrFilterSize, const int16_t **alpSrc,
  631. uint8_t *dest, int dstW, int dstY)
  632. {
  633. x86_reg dummy=0;
  634. x86_reg dstW_reg = dstW;
  635. x86_reg uv_off = c->uv_off_byte;
  636. YSCALEYUV2PACKEDX_ACCURATE
  637. YSCALEYUV2RGBX
  638. "pxor %%mm7, %%mm7 \n\t"
  639. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  640. #ifdef DITHER1XBPP
  641. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  642. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  643. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  644. #endif
  645. WRITERGB16(%4, %5, %%REGa)
  646. YSCALEYUV2PACKEDX_END
  647. }
  /**
   * Vertical multi-tap filter with 16-bit accumulation, writing 16-bit
   * pixels through WRITERGB16.
   *
   * The filter/source parameters and dstY are not referenced in this body;
   * the asm reads its filter lists from the context relative to
   * &c->redDither. The locals dummy, dstW_reg and uv_off exist because
   * YSCALEYUV2PACKEDX_END names them as asm operands.
   *
   * @param c     scaler context (uv_off_byte is the byte distance from U to
   *              V samples)
   * @param dest  output line (operand %4)
   * @param dstW  number of pixels (operand %5, via dstW_reg)
   *
   * mm7 is cleared because WRITERGB16 unpacks G against it. When
   * DITHER1XBPP is defined, the bytes at BLUE_DITHER / GREEN_DITHER /
   * RED_DITHER(%0) are added to B / G / R with unsigned saturation first.
   */
  648. static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
  649. const int16_t **lumSrc, int lumFilterSize,
  650. const int16_t *chrFilter, const int16_t **chrUSrc,
  651. const int16_t **chrVSrc,
  652. int chrFilterSize, const int16_t **alpSrc,
  653. uint8_t *dest, int dstW, int dstY)
  654. {
  655. x86_reg dummy=0;
  656. x86_reg dstW_reg = dstW;
  657. x86_reg uv_off = c->uv_off_byte;
  658. YSCALEYUV2PACKEDX
  659. YSCALEYUV2RGBX
  660. "pxor %%mm7, %%mm7 \n\t"
  661. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  662. #ifdef DITHER1XBPP
  663. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  664. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  665. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  666. #endif
  667. WRITERGB16(%4, %5, %%REGa)
  668. YSCALEYUV2PACKEDX_END
  669. }
  /*
   * REAL_WRITERGB15(dst, dstw, index)
   *
   * Asm fragment that packs eight pixels into 15 significant bits each and
   * stores them as 16-bit words.
   * In: mm2 = B, mm4 = G, mm5 = R (eight bytes each), mm7 = 0.
   * All three channels are masked with bF8. In each resulting 16-bit pixel
   * B>>3 occupies bits 0-4, the five kept G bits occupy bits 5-9 (G is
   * widened against mm7 and shifted left by 2) and the five kept R bits
   * occupy bits 10-14 (R is shifted right by 1 and becomes the high byte).
   * The 16 output bytes are stored with two MOVNTQ at dst + index*2; index
   * is then advanced by 8 and control jumps back to label 1 while
   * index < dstw (unsigned, `jb`). mm1 and mm3 are used as scratch.
   *
   * WRITERGB15 forwards to REAL_WRITERGB15 so that the arguments are
   * macro-expanded before being stringified.
   */
  670. #define REAL_WRITERGB15(dst, dstw, index) \
  671. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  672. "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
  673. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  674. "psrlq $3, %%mm2 \n\t"\
  675. "psrlq $1, %%mm5 \n\t"\
  676. \
  677. "movq %%mm2, %%mm1 \n\t"\
  678. "movq %%mm4, %%mm3 \n\t"\
  679. \
  680. "punpcklbw %%mm7, %%mm3 \n\t"\
  681. "punpcklbw %%mm5, %%mm2 \n\t"\
  682. "punpckhbw %%mm7, %%mm4 \n\t"\
  683. "punpckhbw %%mm5, %%mm1 \n\t"\
  684. \
  685. "psllq $2, %%mm3 \n\t"\
  686. "psllq $2, %%mm4 \n\t"\
  687. \
  688. "por %%mm3, %%mm2 \n\t"\
  689. "por %%mm4, %%mm1 \n\t"\
  690. \
  691. MOVNTQ(%%mm2, (dst, index, 2))\
  692. MOVNTQ(%%mm1, 8(dst, index, 2))\
  693. \
  694. "add $8, "#index" \n\t"\
  695. "cmp "#dstw", "#index" \n\t"\
  696. " jb 1b \n\t"
  697. #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
  /**
   * Vertical multi-tap filter with 32-bit accumulation, writing 15-bit
   * pixels (stored as 16-bit words) through WRITERGB15.
   *
   * The filter/source parameters and dstY are not referenced in this body;
   * the asm reads its filter lists from the context relative to
   * &c->redDither. The locals dummy, dstW_reg and uv_off exist because
   * YSCALEYUV2PACKEDX_END names them as asm operands.
   *
   * @param c     scaler context (uv_off_byte is the byte distance from U to
   *              V samples)
   * @param dest  output line (operand %4)
   * @param dstW  number of pixels (operand %5, via dstW_reg)
   *
   * mm7 is cleared because WRITERGB15 unpacks G against it. When
   * DITHER1XBPP is defined, the bytes at BLUE_DITHER / GREEN_DITHER /
   * RED_DITHER(%0) are added to B / G / R with unsigned saturation first.
   */
  698. static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
  699. const int16_t **lumSrc, int lumFilterSize,
  700. const int16_t *chrFilter, const int16_t **chrUSrc,
  701. const int16_t **chrVSrc,
  702. int chrFilterSize, const int16_t **alpSrc,
  703. uint8_t *dest, int dstW, int dstY)
  704. {
  705. x86_reg dummy=0;
  706. x86_reg dstW_reg = dstW;
  707. x86_reg uv_off = c->uv_off_byte;
  708. YSCALEYUV2PACKEDX_ACCURATE
  709. YSCALEYUV2RGBX
  710. "pxor %%mm7, %%mm7 \n\t"
  711. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  712. #ifdef DITHER1XBPP
  713. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  714. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  715. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  716. #endif
  717. WRITERGB15(%4, %5, %%REGa)
  718. YSCALEYUV2PACKEDX_END
  719. }
  /**
   * Vertical multi-tap filter with 16-bit accumulation, writing 15-bit
   * pixels (stored as 16-bit words) through WRITERGB15.
   *
   * The filter/source parameters and dstY are not referenced in this body;
   * the asm reads its filter lists from the context relative to
   * &c->redDither. The locals dummy, dstW_reg and uv_off exist because
   * YSCALEYUV2PACKEDX_END names them as asm operands.
   *
   * @param c     scaler context (uv_off_byte is the byte distance from U to
   *              V samples)
   * @param dest  output line (operand %4)
   * @param dstW  number of pixels (operand %5, via dstW_reg)
   *
   * mm7 is cleared because WRITERGB15 unpacks G against it. When
   * DITHER1XBPP is defined, the bytes at BLUE_DITHER / GREEN_DITHER /
   * RED_DITHER(%0) are added to B / G / R with unsigned saturation first.
   */
  720. static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
  721. const int16_t **lumSrc, int lumFilterSize,
  722. const int16_t *chrFilter, const int16_t **chrUSrc,
  723. const int16_t **chrVSrc,
  724. int chrFilterSize, const int16_t **alpSrc,
  725. uint8_t *dest, int dstW, int dstY)
  726. {
  727. x86_reg dummy=0;
  728. x86_reg dstW_reg = dstW;
  729. x86_reg uv_off = c->uv_off_byte;
  730. YSCALEYUV2PACKEDX
  731. YSCALEYUV2RGBX
  732. "pxor %%mm7, %%mm7 \n\t"
  733. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  734. #ifdef DITHER1XBPP
  735. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  736. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  737. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  738. #endif
  739. WRITERGB15(%4, %5, %%REGa)
  740. YSCALEYUV2PACKEDX_END
  741. }
  /*
   * WRITEBGR24MMX(dst, dstw, index)
   *
   * Asm fragment that packs eight pixels into 3 bytes each using only shift,
   * unpack and OR instructions, and stores the resulting 24 bytes.
   * In: mm2 = B, mm4 = G, mm5 = R (eight bytes each), mm7 = 0.
   * The channels are first interleaved into four registers of two 4-byte
   * pixels with a zero fourth byte; the zero bytes are then squeezed out and
   * the result is assembled into three quadwords that are stored with MOVNTQ
   * at dst, dst+8 and dst+16. All of mm0..mm7 are overwritten.
   * Unlike the 16/32-bit writers, `dst` is a pointer register that is
   * advanced by 24 per iteration rather than being indexed; index is
   * advanced by 8 and control jumps back to label 1 while index < dstw
   * (unsigned, `jb`).
   */
  742. #define WRITEBGR24MMX(dst, dstw, index) \
  743. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  744. "movq %%mm2, %%mm1 \n\t" /* B */\
  745. "movq %%mm5, %%mm6 \n\t" /* R */\
  746. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  747. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  748. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  749. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  750. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  751. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  752. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  753. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  754. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  755. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  756. \
  757. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  758. "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
  759. "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
  760. "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
  761. \
  762. "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
  763. "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
  764. "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
  765. "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
  766. \
  767. "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
  768. "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
  769. "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
  770. "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
  771. \
  772. "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
  773. "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
  774. "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
  775. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  776. MOVNTQ(%%mm0, (dst))\
  777. \
  778. "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
  779. "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
  780. "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
  781. "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
  782. MOVNTQ(%%mm6, 8(dst))\
  783. \
  784. "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
  785. "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
  786. "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
  787. MOVNTQ(%%mm5, 16(dst))\
  788. \
  789. "add $24, "#dst" \n\t"\
  790. \
  791. "add $8, "#index" \n\t"\
  792. "cmp "#dstw", "#index" \n\t"\
  793. " jb 1b \n\t"
  /*
   * WRITEBGR24MMX2(dst, dstw, index)
   *
   * Variant of WRITEBGR24MMX that builds the three output quadwords with
   * pshufw and the byte masks ff_M24A, ff_M24B and ff_M24C instead of
   * shift/unpack chains.
   * In: mm2 = B, mm4 = G, mm5 = R (eight bytes each). The incoming mm7 is
   * overwritten with ff_M24C right away, and mm4 is shifted right by 8
   * after the first store; mm0, mm1, mm3 and mm6 are scratch.
   * For each output quadword the three channels are shuffled so the needed
   * bytes line up, masked, and ORed together, then stored with MOVNTQ at
   * dst, dst+8 and dst+16.
   * As in WRITEBGR24MMX, `dst` is a pointer register advanced by 24 per
   * iteration; index is advanced by 8 and control jumps back to label 1
   * while index < dstw (unsigned, `jb`).
   */
  794. #define WRITEBGR24MMX2(dst, dstw, index) \
  795. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  796. "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
  797. "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
  798. "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
  799. "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
  800. "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
  801. \
  802. "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
  803. "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
  804. "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
  805. \
  806. "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
  807. "por %%mm1, %%mm6 \n\t"\
  808. "por %%mm3, %%mm6 \n\t"\
  809. MOVNTQ(%%mm6, (dst))\
  810. \
  811. "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
  812. "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
  813. "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
  814. "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
  815. \
  816. "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
  817. "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
  818. "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
  819. \
  820. "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
  821. "por %%mm3, %%mm6 \n\t"\
  822. MOVNTQ(%%mm6, 8(dst))\
  823. \
  824. "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
  825. "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
  826. "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
  827. \
  828. "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
  829. "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
  830. "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
  831. \
  832. "por %%mm1, %%mm3 \n\t"\
  833. "por %%mm3, %%mm6 \n\t"\
  834. MOVNTQ(%%mm6, 16(dst))\
  835. \
  836. "add $24, "#dst" \n\t"\
  837. \
  838. "add $8, "#index" \n\t"\
  839. "cmp "#dstw", "#index" \n\t"\
  840. " jb 1b \n\t"
  /* Bind WRITEBGR24 to the pshufw-based writer when this template is compiled
   * with COMPILE_TEMPLATE_MMX2, and to the plain MMX writer otherwise. */
  841. #if COMPILE_TEMPLATE_MMX2
  842. #undef WRITEBGR24
  843. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
  844. #else
  845. #undef WRITEBGR24
  846. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
  847. #endif
  /**
   * Multi-tap vertical scale to packed 24-bit output, built on
   * YSCALEYUV2PACKEDX_ACCURATE followed by YSCALEYUV2RGBX.
   *
   * The opening of the asm statement is not visible in this function, so it is
   * presumably supplied by YSCALEYUV2PACKEDX_ACCURATE (defined elsewhere in the
   * file). Asm operands: %0 = &c->redDither, %1..%3 = dummy, %4 = dest,
   * %5 = dstW_reg, %6 = uv_off. None of the filter/source parameters appears in
   * the operand list.
   * NOTE(review): the asm stores to dest without a memory clobber or an output
   * operand -- confirm this is intended.
   */
  848. static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
  849. const int16_t **lumSrc, int lumFilterSize,
  850. const int16_t *chrFilter, const int16_t **chrUSrc,
  851. const int16_t **chrVSrc,
  852. int chrFilterSize, const int16_t **alpSrc,
  853. uint8_t *dest, int dstW, int dstY)
  854. {
  855. x86_reg dummy=0;
  856. x86_reg dstW_reg = dstW;
  857. x86_reg uv_off = c->uv_off_byte;
  858. YSCALEYUV2PACKEDX_ACCURATE
  859. YSCALEYUV2RGBX
  860. "pxor %%mm7, %%mm7 \n\t"
  /* REG_c = 3 * REG_a + dest: REG_a is the index passed to WRITEBGR24, which
   * advances its destination by 24 bytes for every 8 index steps. */
  861. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
  862. "add %4, %%"REG_c" \n\t"
  863. WRITEBGR24(%%REGc, %5, %%REGa)
  864. :: "r" (&c->redDither),
  865. "m" (dummy), "m" (dummy), "m" (dummy),
  866. "r" (dest), "m" (dstW_reg), "m"(uv_off)
  867. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  868. );
  869. }
  /**
   * Multi-tap vertical scale to packed 24-bit output, built on
   * YSCALEYUV2PACKEDX followed by YSCALEYUV2RGBX.
   *
   * Same structure as the _ar variant except for the scaler macro used. The
   * opening of the asm statement is presumably supplied by YSCALEYUV2PACKEDX
   * (defined elsewhere in the file). Asm operands: %0 = &c->redDither,
   * %1..%3 = dummy, %4 = dest, %5 = dstW_reg, %6 = uv_off.
   * NOTE(review): the asm stores to dest without a memory clobber or an output
   * operand -- confirm this is intended.
   */
  870. static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
  871. const int16_t **lumSrc, int lumFilterSize,
  872. const int16_t *chrFilter, const int16_t **chrUSrc,
  873. const int16_t **chrVSrc,
  874. int chrFilterSize, const int16_t **alpSrc,
  875. uint8_t *dest, int dstW, int dstY)
  876. {
  877. x86_reg dummy=0;
  878. x86_reg dstW_reg = dstW;
  879. x86_reg uv_off = c->uv_off_byte;
  880. YSCALEYUV2PACKEDX
  881. YSCALEYUV2RGBX
  882. "pxor %%mm7, %%mm7 \n\t"
  /* REG_c = 3 * REG_a + dest: REG_a is the index passed to WRITEBGR24, which
   * advances its destination by 24 bytes for every 8 index steps. */
  883. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
  884. "add %4, %%"REG_c" \n\t"
  885. WRITEBGR24(%%REGc, %5, %%REGa)
  886. :: "r" (&c->redDither),
  887. "m" (dummy), "m" (dummy), "m" (dummy),
  888. "r" (dest), "m" (dstW_reg), "m"(uv_off)
  889. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  890. );
  891. }
  /*
   * REAL_WRITEYUY2: pack 8 pixels into 16 bytes and close the loop opened at
   * label 1 by the preceding scaler macro.
   * Inputs are words in mm1 and mm7 (packed together into mm1) and in mm3 and
   * mm4 (each packed, then byte-interleaved as mm3,mm4,mm3,mm4,...). The final
   * unpack alternates one byte from the mm1:mm7 group with one byte from the
   * mm3/mm4 group. mm7 is reused as a copy of the packed mm1.
   * Stores go to dst + 2*index and dst + 2*index + 8; index advances by 8 and
   * the loop repeats while index is below dstw (jb, unsigned compare).
   * WRITEYUY2 is the argument-expanding wrapper.
   */
  892. #define REAL_WRITEYUY2(dst, dstw, index) \
  893. "packuswb %%mm3, %%mm3 \n\t"\
  894. "packuswb %%mm4, %%mm4 \n\t"\
  895. "packuswb %%mm7, %%mm1 \n\t"\
  896. "punpcklbw %%mm4, %%mm3 \n\t"\
  897. "movq %%mm1, %%mm7 \n\t"\
  898. "punpcklbw %%mm3, %%mm1 \n\t"\
  899. "punpckhbw %%mm3, %%mm7 \n\t"\
  900. \
  901. MOVNTQ(%%mm1, (dst, index, 2))\
  902. MOVNTQ(%%mm7, 8(dst, index, 2))\
  903. \
  904. "add $8, "#index" \n\t"\
  905. "cmp "#dstw", "#index" \n\t"\
  906. " jb 1b \n\t"
  907. #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
  /**
   * Multi-tap vertical scale to packed 4:2:2 output, built on
   * YSCALEYUV2PACKEDX_ACCURATE.
   *
   * Neither the opening of the asm statement nor its operand list is visible
   * here; they are presumably supplied by YSCALEYUV2PACKEDX_ACCURATE and
   * YSCALEYUV2PACKEDX_END (defined elsewhere in the file), which would also be
   * where dummy, dstW_reg and uv_off are consumed. WRITEYUY2 receives %4 as
   * destination and %5 as width, with REG_a as index.
   */
  908. static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
  909. const int16_t **lumSrc, int lumFilterSize,
  910. const int16_t *chrFilter, const int16_t **chrUSrc,
  911. const int16_t **chrVSrc,
  912. int chrFilterSize, const int16_t **alpSrc,
  913. uint8_t *dest, int dstW, int dstY)
  914. {
  915. x86_reg dummy=0;
  916. x86_reg dstW_reg = dstW;
  917. x86_reg uv_off = c->uv_off_byte;
  918. YSCALEYUV2PACKEDX_ACCURATE
  919. /* mm3, mm4, mm1 and mm7 are the words WRITEYUY2 packs; scale them down by 3 bits first */
  920. "psraw $3, %%mm3 \n\t"
  921. "psraw $3, %%mm4 \n\t"
  922. "psraw $3, %%mm1 \n\t"
  923. "psraw $3, %%mm7 \n\t"
  924. WRITEYUY2(%4, %5, %%REGa)
  925. YSCALEYUV2PACKEDX_END
  926. }
  /**
   * Multi-tap vertical scale to packed 4:2:2 output, built on
   * YSCALEYUV2PACKEDX.
   *
   * Neither the opening of the asm statement nor its operand list is visible
   * here; they are presumably supplied by YSCALEYUV2PACKEDX and
   * YSCALEYUV2PACKEDX_END (defined elsewhere in the file), which would also be
   * where dummy, dstW_reg and uv_off are consumed. WRITEYUY2 receives %4 as
   * destination and %5 as width, with REG_a as index.
   */
  927. static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
  928. const int16_t **lumSrc, int lumFilterSize,
  929. const int16_t *chrFilter, const int16_t **chrUSrc,
  930. const int16_t **chrVSrc,
  931. int chrFilterSize, const int16_t **alpSrc,
  932. uint8_t *dest, int dstW, int dstY)
  933. {
  934. x86_reg dummy=0;
  935. x86_reg dstW_reg = dstW;
  936. x86_reg uv_off = c->uv_off_byte;
  937. YSCALEYUV2PACKEDX
  938. /* mm3, mm4, mm1 and mm7 are the words WRITEYUY2 packs; scale them down by 3 bits first */
  939. "psraw $3, %%mm3 \n\t"
  940. "psraw $3, %%mm4 \n\t"
  941. "psraw $3, %%mm1 \n\t"
  942. "psraw $3, %%mm7 \n\t"
  943. WRITEYUY2(%4, %5, %%REGa)
  944. YSCALEYUV2PACKEDX_END
  945. }
  /*
   * Two-line (bilinear) vertical blend followed by YUV -> RGB conversion,
   * split into three string macros that YSCALEYUV2RGB concatenates.
   *
   * In the per-line comments, eax stands for the index macro argument and
   * +2048 stands for the offset read from UV_OFF_PX(c).
   * Every blend has the form ((a - b) * coeff >> 16) + (b >> 4), with pmulhw
   * supplying the >>16.
   *
   * REAL_YSCALEYUV2RGB_UV: zeroes index, opens loop label 1, blends the words
   * at (%2, index) and (%3, index) and again at index + UV_OFF_PX(c), subtracts
   * U_OFFSET / V_OFFSET, and leaves mm2 / mm5 as the offset-corrected chroma
   * and mm3 / mm4 as their products with UG_COEFF / VG_COEFF.
   */
  946. #define REAL_YSCALEYUV2RGB_UV(index, c) \
  947. "xor "#index", "#index" \n\t"\
  948. ".p2align 4 \n\t"\
  949. "1: \n\t"\
  950. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  951. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  952. "add "UV_OFF_PX"("#c"), "#index" \n\t" \
  953. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  954. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  955. "sub "UV_OFF_PX"("#c"), "#index" \n\t" \
  956. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  957. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  958. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  959. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  960. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  961. "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
  962. "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
  963. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
  964. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
  965. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  966. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  967. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  968. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  969. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  970. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  971. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  /* End of REAL_YSCALEYUV2RGB_UV: the continuation backslash above ends on this comment line. */
  /*
   * REAL_YSCALEYUV2RGB_YA: blend 8 words from b1 and b2 at element index
   * (byte offset 2*index) with the coefficient at LUM_MMX_FILTER_OFFSET+8(c).
   * Result: mm1 = first four words, mm7 = next four; mm0 and mm6 are scratch.
   * Used for luma via YSCALEYUV2RGB and, with other pointers, for alpha.
   */
  972. #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
  973. "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  974. "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  975. "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax+4]*/\
  976. "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax+4]*/\
  977. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  978. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
  979. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  980. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
  981. "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
  982. "psraw $4, %%mm7 \n\t" /* buf1[eax+4] >>4*/\
  983. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  984. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  /* End of REAL_YSCALEYUV2RGB_YA: the continuation backslash above ends on this comment line. */
  /*
   * REAL_YSCALEYUV2RGB_COEFF: apply UB/VR/Y coefficients, duplicate each chroma
   * term across a pixel pair (punpck*wd of a register with itself), add luma
   * and pack with unsigned saturation. Result: mm2 = 8 B bytes, mm4 = 8 G
   * bytes, mm5 = 8 R bytes; mm0, mm3, mm6 are scratch.
   */
  985. #define REAL_YSCALEYUV2RGB_COEFF(c) \
  986. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  987. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  988. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  989. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  990. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  991. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  992. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  993. "paddw %%mm3, %%mm4 \n\t"\
  994. "movq %%mm2, %%mm0 \n\t"\
  995. "movq %%mm5, %%mm6 \n\t"\
  996. "movq %%mm4, %%mm3 \n\t"\
  997. "punpcklwd %%mm2, %%mm2 \n\t"\
  998. "punpcklwd %%mm5, %%mm5 \n\t"\
  999. "punpcklwd %%mm4, %%mm4 \n\t"\
  1000. "paddw %%mm1, %%mm2 \n\t"\
  1001. "paddw %%mm1, %%mm5 \n\t"\
  1002. "paddw %%mm1, %%mm4 \n\t"\
  1003. "punpckhwd %%mm0, %%mm0 \n\t"\
  1004. "punpckhwd %%mm6, %%mm6 \n\t"\
  1005. "punpckhwd %%mm3, %%mm3 \n\t"\
  1006. "paddw %%mm7, %%mm0 \n\t"\
  1007. "paddw %%mm7, %%mm6 \n\t"\
  1008. "paddw %%mm7, %%mm3 \n\t"\
  1009. /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  1010. "packuswb %%mm0, %%mm2 \n\t"\
  1011. "packuswb %%mm6, %%mm5 \n\t"\
  1012. "packuswb %%mm3, %%mm4 \n\t"\
  /* End of REAL_YSCALEYUV2RGB_COEFF: the continuation backslash above ends on this comment line. */
  /* Argument-expanding wrapper, so operands such as %%REGBP are expanded before stringification. */
  1013. #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
  /* Full pipeline: chroma blend, luma blend from operands %0 / %1, then colour conversion. */
  1014. #define YSCALEYUV2RGB(index, c) \
  1015. REAL_YSCALEYUV2RGB_UV(index, c) \
  1016. REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
  1017. REAL_YSCALEYUV2RGB_COEFF(c)
  1018. /**
  1019. * vertical bilinear scale YV12 to RGB
  *
  * Blends the two input lines in buf[] and ubuf[] with YSCALEYUV2RGB and
  * writes 32-bit pixels with WRITEBGR32. vbuf, dstW, yalpha, uvalpha and y are
  * not referenced in this body; the width is read inside the asm from
  * 8280(%5), i.e. relative to &c->redDither (the same constant is called
  * DSTW_OFFSET in the note inside yuv2bgr24_2).
  *
  * When CONFIG_SWSCALE_ALPHA is set and c->alpPixBuf is non-NULL, the alpha
  * lines in abuf[] are blended with YSCALEYUV2RGB_YA as well; otherwise mm7 is
  * set to all ones (pcmpeqd) and passed as the alpha register.
  *
  * In the two asm statements that use REG_b and REG_BP, REG_b is saved to
  * ESP_OFFSET(%5) and restored at the end, and REG_BP is pushed and popped.
  * Neither register is declared as clobbered.
  1020. */
  1021. static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
  1022. const int16_t *ubuf[2], const int16_t *vbuf[2],
  1023. const int16_t *abuf[2], uint8_t *dest,
  1024. int dstW, int yalpha, int uvalpha, int y)
  1025. {
  1026. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  1027. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1028. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1029. const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
  1030. #if ARCH_X86_64
  /* Enough registers: r8 is the index, %6 / %7 carry the alpha pointers. */
  1031. __asm__ volatile(
  1032. YSCALEYUV2RGB(%%r8, %5)
  1033. YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
  1034. "psraw $3, %%mm1 \n\t" /* blended alpha >>3 (on top of the >>4 inside the blend) */
  1035. "psraw $3, %%mm7 \n\t" /* blended alpha >>3 (on top of the >>4 inside the blend) */
  1036. "packuswb %%mm7, %%mm1 \n\t"
  1037. WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1038. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
  1039. "a" (&c->redDither),
  1040. "r" (abuf0), "r" (abuf1)
  1041. : "%r8"
  1042. );
  1043. #else
  /* No spare registers: park the alpha pointers in the context and swap them
   * into %0 / %1 around the alpha blend. */
  1044. *(const uint16_t **)(&c->u_temp)=abuf0;
  1045. *(const uint16_t **)(&c->v_temp)=abuf1;
  1046. __asm__ volatile(
  1047. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1048. "mov %4, %%"REG_b" \n\t"
  1049. "push %%"REG_BP" \n\t"
  1050. YSCALEYUV2RGB(%%REGBP, %5)
  1051. "push %0 \n\t"
  1052. "push %1 \n\t"
  1053. "mov "U_TEMP"(%5), %0 \n\t"
  1054. "mov "V_TEMP"(%5), %1 \n\t"
  1055. YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
  1056. "psraw $3, %%mm1 \n\t" /* blended alpha >>3 (on top of the >>4 inside the blend) */
  1057. "psraw $3, %%mm7 \n\t" /* blended alpha >>3 (on top of the >>4 inside the blend) */
  1058. "packuswb %%mm7, %%mm1 \n\t"
  1059. "pop %1 \n\t"
  1060. "pop %0 \n\t"
  1061. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  1062. "pop %%"REG_BP" \n\t"
  1063. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1064. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1065. "a" (&c->redDither)
  1066. );
  1067. #endif
  1068. } else {
  1069. __asm__ volatile(
  1070. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1071. "mov %4, %%"REG_b" \n\t"
  1072. "push %%"REG_BP" \n\t"
  1073. YSCALEYUV2RGB(%%REGBP, %5)
  1074. "pcmpeqd %%mm7, %%mm7 \n\t"
  1075. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1076. "pop %%"REG_BP" \n\t"
  1077. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1078. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1079. "a" (&c->redDither)
  1080. );
  1081. }
  1082. }
  /**
   * Two-line vertical blend to packed 24-bit output.
   *
   * Runs YSCALEYUV2RGB on buf[0..1] / ubuf[0..1] and writes with WRITEBGR24.
   * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced in this body;
   * the width is read inside the asm from 8280(%5).
   * REG_b is saved to ESP_OFFSET(%5) and then holds dest; REG_BP is pushed and
   * used as the index; both are restored before the asm ends.
   */
  1083. static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
  1084. const int16_t *ubuf[2], const int16_t *vbuf[2],
  1085. const int16_t *abuf[2], uint8_t *dest,
  1086. int dstW, int yalpha, int uvalpha, int y)
  1087. {
  1088. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  1089. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1090. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  1091. __asm__ volatile(
  1092. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1093. "mov %4, %%"REG_b" \n\t"
  1094. "push %%"REG_BP" \n\t"
  1095. YSCALEYUV2RGB(%%REGBP, %5)
  1096. "pxor %%mm7, %%mm7 \n\t"
  1097. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1098. "pop %%"REG_BP" \n\t"
  1099. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1100. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1101. "a" (&c->redDither)
  1102. );
  1103. }
  /**
   * Two-line vertical blend to 15-bit output.
   *
   * Runs YSCALEYUV2RGB on buf[0..1] / ubuf[0..1], optionally adds the
   * BLUE/GREEN/RED_DITHER values from the context with unsigned saturation
   * (only when DITHER1XBPP is defined), and writes with WRITERGB15.
   * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced in this body;
   * the width is read inside the asm from 8280(%5).
   * REG_b is saved to ESP_OFFSET(%5) and then holds dest; REG_BP is pushed and
   * used as the index; both are restored before the asm ends.
   */
  1104. static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
  1105. const int16_t *ubuf[2], const int16_t *vbuf[2],
  1106. const int16_t *abuf[2], uint8_t *dest,
  1107. int dstW, int yalpha, int uvalpha, int y)
  1108. {
  1109. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  1110. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1111. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  1112. __asm__ volatile(
  1113. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1114. "mov %4, %%"REG_b" \n\t"
  1115. "push %%"REG_BP" \n\t"
  1116. YSCALEYUV2RGB(%%REGBP, %5)
  1117. "pxor %%mm7, %%mm7 \n\t"
  1118. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1119. #ifdef DITHER1XBPP
  1120. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1121. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1122. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1123. #endif
  1124. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1125. "pop %%"REG_BP" \n\t"
  1126. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1127. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1128. "a" (&c->redDither)
  1129. );
  1130. }
  /**
   * Two-line vertical blend to 16-bit output.
   *
   * Runs YSCALEYUV2RGB on buf[0..1] / ubuf[0..1], optionally adds the
   * BLUE/GREEN/RED_DITHER values from the context with unsigned saturation
   * (only when DITHER1XBPP is defined), and writes with WRITERGB16.
   * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced in this body;
   * the width is read inside the asm from 8280(%5).
   * REG_b is saved to ESP_OFFSET(%5) and then holds dest; REG_BP is pushed and
   * used as the index; both are restored before the asm ends.
   */
  1131. static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
  1132. const int16_t *ubuf[2], const int16_t *vbuf[2],
  1133. const int16_t *abuf[2], uint8_t *dest,
  1134. int dstW, int yalpha, int uvalpha, int y)
  1135. {
  1136. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  1137. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1138. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  1139. __asm__ volatile(
  1140. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1141. "mov %4, %%"REG_b" \n\t"
  1142. "push %%"REG_BP" \n\t"
  1143. YSCALEYUV2RGB(%%REGBP, %5)
  1144. "pxor %%mm7, %%mm7 \n\t"
  1145. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1146. #ifdef DITHER1XBPP
  1147. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1148. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1149. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1150. #endif
  1151. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1152. "pop %%"REG_BP" \n\t"
  1153. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1154. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1155. "a" (&c->redDither)
  1156. );
  1157. }
  /*
   * REAL_YSCALEYUV2PACKED: two-line vertical blend without colour conversion,
   * for packed YUV output.
   * Before the loop it shifts the coefficient vectors stored at
   * CHR_MMX_FILTER_OFFSET+8(c) and LUM_MMX_FILTER_OFFSET+8(c) right by 3 and
   * writes them back, so the context memory is modified by every expansion.
   * The loop body then computes ((a - b) * coeff >> 16) + (b >> 7).
   * Result: mm3 / mm4 = blended chroma from index and index + UV_OFF_PX(c),
   * mm1 / mm7 = first and next four blended luma words.
   * In the per-line comments, eax stands for the index argument and +2048 for
   * the UV_OFF_PX(c) offset.
   */
  1158. #define REAL_YSCALEYUV2PACKED(index, c) \
  1159. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  1160. "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
  1161. "psraw $3, %%mm0 \n\t"\
  1162. "psraw $3, %%mm1 \n\t"\
  1163. "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  1164. "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  1165. "xor "#index", "#index" \n\t"\
  1166. ".p2align 4 \n\t"\
  1167. "1: \n\t"\
  1168. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  1169. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  1170. "add "UV_OFF_PX"("#c"), "#index" \n\t" \
  1171. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  1172. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  1173. "sub "UV_OFF_PX"("#c"), "#index" \n\t" \
  1174. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  1175. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  1176. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  1177. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  1178. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  1179. "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
  1180. "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
  1181. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
  1182. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
  1183. "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  1184. "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  1185. "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax+4]*/\
  1186. "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax+4]*/\
  1187. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  1188. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
  1189. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  1190. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
  1191. "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
  1192. "psraw $7, %%mm7 \n\t" /* buf1[eax+4] >>7*/\
  1193. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  1194. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  /* End of REAL_YSCALEYUV2PACKED: the continuation backslash above ends on this comment line. */
  /* Argument-expanding wrapper. */
  1195. #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
  /**
   * Two-line vertical blend to packed 4:2:2 output.
   *
   * Runs YSCALEYUV2PACKED on buf[0..1] / ubuf[0..1] and writes with WRITEYUY2.
   * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced in this body;
   * the width is read inside the asm from 8280(%5).
   * YSCALEYUV2PACKED rewrites the coefficient vectors stored in the context
   * relative to %5 before looping.
   * REG_b is saved to ESP_OFFSET(%5) and then holds dest; REG_BP is pushed and
   * used as the index; both are restored before the asm ends.
   */
  1196. static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
  1197. const int16_t *ubuf[2], const int16_t *vbuf[2],
  1198. const int16_t *abuf[2], uint8_t *dest,
  1199. int dstW, int yalpha, int uvalpha, int y)
  1200. {
  1201. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  1202. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1203. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  1204. __asm__ volatile(
  1205. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1206. "mov %4, %%"REG_b" \n\t"
  1207. "push %%"REG_BP" \n\t"
  1208. YSCALEYUV2PACKED(%%REGBP, %5)
  1209. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1210. "pop %%"REG_BP" \n\t"
  1211. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1212. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1213. "a" (&c->redDither)
  1214. );
  1215. }
  /*
   * REAL_YSCALEYUV2RGB1: single-line YUV -> RGB conversion (no vertical blend).
   * Zeroes index, opens loop label 1, reads chroma only from operand %2 (at
   * index and at index + UV_OFF_PX(c)) and luma only from operand %0; operands
   * %1 and %3 are not used. Each input word is shifted right by 4 before the
   * offset/coefficient stage, which matches REAL_YSCALEYUV2RGB_COEFF.
   * Result: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes.
   * In the per-line comments, eax stands for the index argument and +2048 for
   * the UV_OFF_PX(c) offset.
   */
  1216. #define REAL_YSCALEYUV2RGB1(index, c) \
  1217. "xor "#index", "#index" \n\t"\
  1218. ".p2align 4 \n\t"\
  1219. "1: \n\t"\
  1220. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  1221. "add "UV_OFF_PX"("#c"), "#index" \n\t" \
  1222. "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  1223. "sub "UV_OFF_PX"("#c"), "#index" \n\t" \
  1224. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
  1225. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
  1226. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  1227. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  1228. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  1229. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  1230. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  1231. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  1232. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  1233. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1234. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
  1235. "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
  1236. "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
  1237. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  1238. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  1239. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  1240. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  1241. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  1242. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  1243. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  1244. "paddw %%mm3, %%mm4 \n\t"\
  1245. "movq %%mm2, %%mm0 \n\t"\
  1246. "movq %%mm5, %%mm6 \n\t"\
  1247. "movq %%mm4, %%mm3 \n\t"\
  1248. "punpcklwd %%mm2, %%mm2 \n\t"\
  1249. "punpcklwd %%mm5, %%mm5 \n\t"\
  1250. "punpcklwd %%mm4, %%mm4 \n\t"\
  1251. "paddw %%mm1, %%mm2 \n\t"\
  1252. "paddw %%mm1, %%mm5 \n\t"\
  1253. "paddw %%mm1, %%mm4 \n\t"\
  1254. "punpckhwd %%mm0, %%mm0 \n\t"\
  1255. "punpckhwd %%mm6, %%mm6 \n\t"\
  1256. "punpckhwd %%mm3, %%mm3 \n\t"\
  1257. "paddw %%mm7, %%mm0 \n\t"\
  1258. "paddw %%mm7, %%mm6 \n\t"\
  1259. "paddw %%mm7, %%mm3 \n\t"\
  1260. /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  1261. "packuswb %%mm0, %%mm2 \n\t"\
  1262. "packuswb %%mm6, %%mm5 \n\t"\
  1263. "packuswb %%mm3, %%mm4 \n\t"\
  /* End of REAL_YSCALEYUV2RGB1: the continuation backslash above ends on this comment line. */
  /* Argument-expanding wrapper. */
  1264. #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
  1265. // do vertical chrominance interpolation
  /*
   * REAL_YSCALEYUV2RGB1b: like REAL_YSCALEYUV2RGB1, but chroma is the sum of
   * the words from operands %2 and %3, shifted right by 5 with a logical shift
   * (psrlw); luma still comes from operand %0 only.
   * Result: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes.
   * In the per-line comments, eax stands for the index argument and +2048 for
   * the UV_OFF_PX(c) offset.
   */
  1266. #define REAL_YSCALEYUV2RGB1b(index, c) \
  1267. "xor "#index", "#index" \n\t"\
  1268. ".p2align 4 \n\t"\
  1269. "1: \n\t"\
  1270. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  1271. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  1272. "add "UV_OFF_PX"("#c"), "#index" \n\t" \
  1273. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  1274. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  1275. "sub "UV_OFF_PX"("#c"), "#index" \n\t" \
  1276. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  1277. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  1278. "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
  1279. "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
  1280. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  1281. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  1282. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  1283. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  1284. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  1285. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  1286. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  1287. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1288. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
  1289. "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
  1290. "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
  1291. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  1292. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  1293. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  1294. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  1295. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  1296. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  1297. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  1298. "paddw %%mm3, %%mm4 \n\t"\
  1299. "movq %%mm2, %%mm0 \n\t"\
  1300. "movq %%mm5, %%mm6 \n\t"\
  1301. "movq %%mm4, %%mm3 \n\t"\
  1302. "punpcklwd %%mm2, %%mm2 \n\t"\
  1303. "punpcklwd %%mm5, %%mm5 \n\t"\
  1304. "punpcklwd %%mm4, %%mm4 \n\t"\
  1305. "paddw %%mm1, %%mm2 \n\t"\
  1306. "paddw %%mm1, %%mm5 \n\t"\
  1307. "paddw %%mm1, %%mm4 \n\t"\
  1308. "punpckhwd %%mm0, %%mm0 \n\t"\
  1309. "punpckhwd %%mm6, %%mm6 \n\t"\
  1310. "punpckhwd %%mm3, %%mm3 \n\t"\
  1311. "paddw %%mm7, %%mm0 \n\t"\
  1312. "paddw %%mm7, %%mm6 \n\t"\
  1313. "paddw %%mm7, %%mm3 \n\t"\
  1314. /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  1315. "packuswb %%mm0, %%mm2 \n\t"\
  1316. "packuswb %%mm6, %%mm5 \n\t"\
  1317. "packuswb %%mm3, %%mm4 \n\t"\
  /* End of REAL_YSCALEYUV2RGB1b: the continuation backslash above ends on this comment line. */
  /* Argument-expanding wrapper. */
  1318. #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
  /*
   * REAL_YSCALEYUV2RGB1_ALPHA: load 8 alpha words from operand %1 at element
   * index (byte offset 2*index), shift each right by 7 and pack them with
   * unsigned saturation into the 8 bytes of mm7. mm1 is used as scratch.
   * The caller must bind operand %1 to the alpha line.
   */
  1319. #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
  1320. "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
  1321. "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
  1322. "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
  1323. "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
  1324. "packuswb %%mm1, %%mm7 \n\t"
  1325. #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
  1326. /**
  1327. * YV12 to RGB without scaling or interpolating
  *
  * Converts one luma line (buf0) to 32-bit pixels with WRITEBGR32.
  * uvalpha < 2048 selects YSCALEYUV2RGB1, which reads chroma from ubuf[0]
  * only; otherwise YSCALEYUV2RGB1b is used, which sums ubuf[0] and ubuf[1].
  * When CONFIG_SWSCALE_ALPHA is set and c->alpPixBuf is non-NULL, operand %1
  * is bound to abuf0 and YSCALEYUV2RGB1_ALPHA loads alpha into mm7; otherwise
  * operand %1 is buf1 (an alias of buf0) and mm7 is set to all ones.
  * bguf, dstW and y are not referenced in this body; the width is read inside
  * the asm from 8280(%5).
  * REG_b is saved to ESP_OFFSET(%5) and then holds dest; REG_BP is pushed and
  * used as the index; both are restored before each asm ends.
  1328. */
  1329. static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
  1330. const int16_t *ubuf[2], const int16_t *bguf[2],
  1331. const int16_t *abuf0, uint8_t *dest,
  1332. int dstW, int uvalpha, int y)
  1333. {
  1334. const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1335. const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1336. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1337. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1338. __asm__ volatile(
  1339. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1340. "mov %4, %%"REG_b" \n\t"
  1341. "push %%"REG_BP" \n\t"
  1342. YSCALEYUV2RGB1(%%REGBP, %5)
  1343. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1344. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1345. "pop %%"REG_BP" \n\t"
  1346. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1347. :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1348. "a" (&c->redDither)
  1349. );
  1350. } else {
  1351. __asm__ volatile(
  1352. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1353. "mov %4, %%"REG_b" \n\t"
  1354. "push %%"REG_BP" \n\t"
  1355. YSCALEYUV2RGB1(%%REGBP, %5)
  1356. "pcmpeqd %%mm7, %%mm7 \n\t"
  1357. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1358. "pop %%"REG_BP" \n\t"
  1359. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1360. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1361. "a" (&c->redDither)
  1362. );
  1363. }
  1364. } else {
  1365. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1366. __asm__ volatile(
  1367. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1368. "mov %4, %%"REG_b" \n\t"
  1369. "push %%"REG_BP" \n\t"
  1370. YSCALEYUV2RGB1b(%%REGBP, %5)
  1371. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1372. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1373. "pop %%"REG_BP" \n\t"
  1374. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1375. :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1376. "a" (&c->redDither)
  1377. );
  1378. } else {
  1379. __asm__ volatile(
  1380. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1381. "mov %4, %%"REG_b" \n\t"
  1382. "push %%"REG_BP" \n\t"
  1383. YSCALEYUV2RGB1b(%%REGBP, %5)
  1384. "pcmpeqd %%mm7, %%mm7 \n\t"
  1385. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1386. "pop %%"REG_BP" \n\t"
  1387. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1388. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1389. "a" (&c->redDither)
  1390. );
  1391. }
  1392. }
  1393. }
  /**
   * Single-line conversion to packed 24-bit output.
   *
   * uvalpha < 2048 selects YSCALEYUV2RGB1 (chroma from ubuf[0] only);
   * otherwise YSCALEYUV2RGB1b is used (sum of ubuf[0] and ubuf[1]). Output is
   * written with WRITEBGR24.
   * bguf, abuf0, dstW and y are not referenced in this body; the width is read
   * inside the asm from 8280(%5). buf1 is an alias of buf0 that only fills
   * operand %1.
   * REG_b is saved to ESP_OFFSET(%5) and then holds dest; REG_BP is pushed and
   * used as the index; both are restored before each asm ends.
   */
  1394. static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
  1395. const int16_t *ubuf[2], const int16_t *bguf[2],
  1396. const int16_t *abuf0, uint8_t *dest,
  1397. int dstW, int uvalpha, int y)
  1398. {
  1399. const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1400. const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1401. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1402. __asm__ volatile(
  1403. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1404. "mov %4, %%"REG_b" \n\t"
  1405. "push %%"REG_BP" \n\t"
  1406. YSCALEYUV2RGB1(%%REGBP, %5)
  1407. "pxor %%mm7, %%mm7 \n\t"
  1408. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1409. "pop %%"REG_BP" \n\t"
  1410. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1411. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1412. "a" (&c->redDither)
  1413. );
  1414. } else {
  1415. __asm__ volatile(
  1416. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1417. "mov %4, %%"REG_b" \n\t"
  1418. "push %%"REG_BP" \n\t"
  1419. YSCALEYUV2RGB1b(%%REGBP, %5)
  1420. "pxor %%mm7, %%mm7 \n\t"
  1421. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1422. "pop %%"REG_BP" \n\t"
  1423. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1424. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1425. "a" (&c->redDither)
  1426. );
  1427. }
  1428. }
  /**
   * Single-line conversion to 15-bit output.
   *
   * uvalpha < 2048 selects YSCALEYUV2RGB1 (chroma from ubuf[0] only);
   * otherwise YSCALEYUV2RGB1b is used (sum of ubuf[0] and ubuf[1]). When
   * DITHER1XBPP is defined, the BLUE/GREEN/RED_DITHER values from the context
   * are added with unsigned saturation before WRITERGB15.
   * bguf, abuf0, dstW and y are not referenced in this body; the width is read
   * inside the asm from 8280(%5). buf1 is an alias of buf0 that only fills
   * operand %1.
   * REG_b is saved to ESP_OFFSET(%5) and then holds dest; REG_BP is pushed and
   * used as the index; both are restored before each asm ends.
   */
  1429. static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
  1430. const int16_t *ubuf[2], const int16_t *bguf[2],
  1431. const int16_t *abuf0, uint8_t *dest,
  1432. int dstW, int uvalpha, int y)
  1433. {
  1434. const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1435. const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1436. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1437. __asm__ volatile(
  1438. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1439. "mov %4, %%"REG_b" \n\t"
  1440. "push %%"REG_BP" \n\t"
  1441. YSCALEYUV2RGB1(%%REGBP, %5)
  1442. "pxor %%mm7, %%mm7 \n\t"
  1443. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1444. #ifdef DITHER1XBPP
  1445. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1446. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1447. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1448. #endif
  1449. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1450. "pop %%"REG_BP" \n\t"
  1451. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1452. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1453. "a" (&c->redDither)
  1454. );
  1455. } else {
  1456. __asm__ volatile(
  1457. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1458. "mov %4, %%"REG_b" \n\t"
  1459. "push %%"REG_BP" \n\t"
  1460. YSCALEYUV2RGB1b(%%REGBP, %5)
  1461. "pxor %%mm7, %%mm7 \n\t"
  1462. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1463. #ifdef DITHER1XBPP
  1464. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1465. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1466. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1467. #endif
  1468. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1469. "pop %%"REG_BP" \n\t"
  1470. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1471. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1472. "a" (&c->redDither)
  1473. );
  1474. }
  1475. }
  /**
   * Single-line conversion to 16-bit output.
   *
   * uvalpha < 2048 selects YSCALEYUV2RGB1 (chroma from ubuf[0] only);
   * otherwise YSCALEYUV2RGB1b is used (sum of ubuf[0] and ubuf[1]). When
   * DITHER1XBPP is defined, the BLUE/GREEN/RED_DITHER values from the context
   * are added with unsigned saturation before WRITERGB16.
   * bguf, abuf0, dstW and y are not referenced in this body; the width is read
   * inside the asm from 8280(%5). buf1 is an alias of buf0 that only fills
   * operand %1.
   * REG_b is saved to ESP_OFFSET(%5) and then holds dest; REG_BP is pushed and
   * used as the index; both are restored before each asm ends.
   */
  1476. static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
  1477. const int16_t *ubuf[2], const int16_t *bguf[2],
  1478. const int16_t *abuf0, uint8_t *dest,
  1479. int dstW, int uvalpha, int y)
  1480. {
  1481. const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1482. const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1483. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1484. __asm__ volatile(
  1485. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1486. "mov %4, %%"REG_b" \n\t"
  1487. "push %%"REG_BP" \n\t"
  1488. YSCALEYUV2RGB1(%%REGBP, %5)
  1489. "pxor %%mm7, %%mm7 \n\t"
  1490. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1491. #ifdef DITHER1XBPP
  1492. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1493. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1494. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1495. #endif
  1496. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1497. "pop %%"REG_BP" \n\t"
  1498. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1499. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1500. "a" (&c->redDither)
  1501. );
  1502. } else {
  1503. __asm__ volatile(
  1504. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1505. "mov %4, %%"REG_b" \n\t"
  1506. "push %%"REG_BP" \n\t"
  1507. YSCALEYUV2RGB1b(%%REGBP, %5)
  1508. "pxor %%mm7, %%mm7 \n\t"
  1509. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1510. #ifdef DITHER1XBPP
  1511. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1512. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1513. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1514. #endif
  1515. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1516. "pop %%"REG_BP" \n\t"
  1517. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1518. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1519. "a" (&c->redDither)
  1520. );
  1521. }
  1522. }
  /*
   * REAL_YSCALEYUV2PACKED1: single-line load for packed YUV output.
   * Zeroes index, opens loop label 1, reads chroma from operand %2 (at index
   * and at index + UV_OFF_PX(c)) into mm3 / mm4 and 8 luma words from operand
   * %0 into mm1 / mm7, shifting every word right by 7 (arithmetic).
   * Operands %1 and %3 are not used.
   */
  1523. #define REAL_YSCALEYUV2PACKED1(index, c) \
  1524. "xor "#index", "#index" \n\t"\
  1525. ".p2align 4 \n\t"\
  1526. "1: \n\t"\
  1527. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  1528. "add "UV_OFF_PX"("#c"), "#index" \n\t" \
  1529. "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  1530. "sub "UV_OFF_PX"("#c"), "#index" \n\t" \
  1531. "psraw $7, %%mm3 \n\t" \
  1532. "psraw $7, %%mm4 \n\t" \
  1533. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1534. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
  1535. "psraw $7, %%mm1 \n\t" \
  1536. "psraw $7, %%mm7 \n\t" \
  /* End of REAL_YSCALEYUV2PACKED1: the continuation backslash above ends on this comment line. */
  /* Argument-expanding wrapper. */
  1537. #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
  /*
   * REAL_YSCALEYUV2PACKED1b: like REAL_YSCALEYUV2PACKED1, but chroma is the
   * sum of the words from operands %2 and %3, shifted right by 8 with a
   * logical shift (psrlw). Luma still comes from operand %0 only, shifted
   * right by 7 (arithmetic). Result: mm3 / mm4 = chroma, mm1 / mm7 = luma.
   */
  1538. #define REAL_YSCALEYUV2PACKED1b(index, c) \
  1539. "xor "#index", "#index" \n\t"\
  1540. ".p2align 4 \n\t"\
  1541. "1: \n\t"\
  1542. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  1543. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  1544. "add "UV_OFF_PX"("#c"), "#index" \n\t" \
  1545. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  1546. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  1547. "sub "UV_OFF_PX"("#c"), "#index" \n\t" \
  1548. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  1549. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  1550. "psrlw $8, %%mm3 \n\t" \
  1551. "psrlw $8, %%mm4 \n\t" \
  1552. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1553. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
  1554. "psraw $7, %%mm1 \n\t" \
  1555. "psraw $7, %%mm7 \n\t"
  1556. #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
  /**
   * Single-line conversion to packed 4:2:2 output.
   *
   * uvalpha < 2048 selects YSCALEYUV2PACKED1 (chroma from ubuf[0] only);
   * otherwise YSCALEYUV2PACKED1b is used (sum of ubuf[0] and ubuf[1]). Output
   * is written with WRITEYUY2.
   * bguf, abuf0, dstW and y are not referenced in this body; the width is read
   * inside the asm from 8280(%5). buf1 is an alias of buf0 that only fills
   * operand %1.
   * REG_b is saved to ESP_OFFSET(%5) and then holds dest; REG_BP is pushed and
   * used as the index; both are restored before each asm ends.
   */
  1557. static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
  1558. const int16_t *ubuf[2], const int16_t *bguf[2],
  1559. const int16_t *abuf0, uint8_t *dest,
  1560. int dstW, int uvalpha, int y)
  1561. {
  1562. const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1563. const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1564. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1565. __asm__ volatile(
  1566. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1567. "mov %4, %%"REG_b" \n\t"
  1568. "push %%"REG_BP" \n\t"
  1569. YSCALEYUV2PACKED1(%%REGBP, %5)
  1570. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1571. "pop %%"REG_BP" \n\t"
  1572. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1573. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1574. "a" (&c->redDither)
  1575. );
  1576. } else {
  1577. __asm__ volatile(
  1578. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1579. "mov %4, %%"REG_b" \n\t"
  1580. "push %%"REG_BP" \n\t"
  1581. YSCALEYUV2PACKED1b(%%REGBP, %5)
  1582. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1583. "pop %%"REG_BP" \n\t"
  1584. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1585. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1586. "a" (&c->redDither)
  1587. );
  1588. }
  1589. }
  1590. #if !COMPILE_TEMPLATE_MMX2
  1591. //FIXME yuy2* can read up to 7 samples too much
  /*
   * Copy every even-indexed byte of src into dst (src[0], src[2], ...).
   *
   * The index register starts at -width and counts up by 8; src and dst are
   * passed pre-advanced by width*2 and width so that the negative index
   * addresses the start of each buffer. Each pass reads 16 source bytes,
   * ANDs every 16-bit word with bm01010101 (presumably 0x00FF per word --
   * confirm at its definition), packs the words to bytes and stores 8 bytes.
   *
   * The loop test sits at the bottom, so one pass runs even for width == 0,
   * and a width that is not a multiple of 8 makes the last pass read and
   * store beyond the requested width.
   *
   * `unused` is not referenced.
   */
  1592. static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src,
  1593. int width, uint32_t *unused)
  1594. {
  1595. __asm__ volatile(
  1596. "movq "MANGLE(bm01010101)", %%mm2 \n\t"
  1597. "mov %0, %%"REG_a" \n\t"
  1598. "1: \n\t"
  1599. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1600. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1601. "pand %%mm2, %%mm0 \n\t"
  1602. "pand %%mm2, %%mm1 \n\t"
  1603. "packuswb %%mm1, %%mm0 \n\t"
  1604. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1605. "add $8, %%"REG_a" \n\t"
  /* keep looping while the index is still negative */
  1606. " js 1b \n\t"
  1607. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1608. : "%"REG_a
  1609. );
  1610. }
  /*
   * Split the odd-indexed bytes of src1 into two planes: bytes at offsets
   * 1, 5, 9, ... go to dstU and bytes at offsets 3, 7, 11, ... go to dstV.
   *
   * The index starts at -width and advances by 4; src1, dstU and dstV are
   * passed pre-advanced by width*4, width and width. Each pass reads 16 source
   * bytes and stores 4 bytes to each destination. The loop test is at the
   * bottom, so one pass always runs.
   *
   * src2 is only used by the assert, which requires it to equal src1;
   * `unused` is not referenced.
   */
  1611. static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
  1612. const uint8_t *src1, const uint8_t *src2,
  1613. int width, uint32_t *unused)
  1614. {
  1615. __asm__ volatile(
  1616. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1617. "mov %0, %%"REG_a" \n\t"
  1618. "1: \n\t"
  1619. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1620. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  /* keep the high byte of every word, i.e. the odd source bytes */
  1621. "psrlw $8, %%mm0 \n\t"
  1622. "psrlw $8, %%mm1 \n\t"
  1623. "packuswb %%mm1, %%mm0 \n\t"
  /* de-interleave: high bytes stay in mm0 (stored via %3 = dstV),
   * low bytes go to mm1 (stored via %2 = dstU) */
  1624. "movq %%mm0, %%mm1 \n\t"
  1625. "psrlw $8, %%mm0 \n\t"
  1626. "pand %%mm4, %%mm1 \n\t"
  1627. "packuswb %%mm0, %%mm0 \n\t"
  1628. "packuswb %%mm1, %%mm1 \n\t"
  1629. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1630. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1631. "add $4, %%"REG_a" \n\t"
  1632. " js 1b \n\t"
  1633. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1634. : "%"REG_a
  1635. );
  1636. assert(src1 == src2);
  1637. }
  1638. /* This is almost identical to the previous, and exists only because
  1639. * yuy2ToY/UV(dst, src + 1, ...) would have 100% unaligned accesses. */
  /*
   * Copy every odd-indexed byte of src into dst (src[1], src[3], ...).
   *
   * Same loop shape as yuy2ToY: the index runs from -width upwards in steps
   * of 8 over pointers pre-advanced by width*2 (src) and width (dst). Here
   * each 16-bit word is shifted right by 8 instead of masked, so the high
   * byte of every word is the one that is kept. Each pass reads 16 bytes and
   * stores 8; one pass runs even when width is 0.
   *
   * `unused` is not referenced.
   */
  1640. static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src,
  1641. int width, uint32_t *unused)
  1642. {
  1643. __asm__ volatile(
  1644. "mov %0, %%"REG_a" \n\t"
  1645. "1: \n\t"
  1646. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1647. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1648. "psrlw $8, %%mm0 \n\t"
  1649. "psrlw $8, %%mm1 \n\t"
  1650. "packuswb %%mm1, %%mm0 \n\t"
  1651. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1652. "add $8, %%"REG_a" \n\t"
  1653. " js 1b \n\t"
  1654. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1655. : "%"REG_a
  1656. );
  1657. }
  /*
   * Split the even-indexed bytes of src1 into two planes: bytes at offsets
   * 0, 4, 8, ... go to dstU and bytes at offsets 2, 6, 10, ... go to dstV.
   *
   * Differs from yuy2ToUV only in the first step, which masks each word with
   * bm01010101 (keeping the low byte) instead of shifting it right by 8.
   * Each pass reads 16 source bytes and stores 4 bytes per destination; the
   * index runs from -width upwards in steps of 4, with the test at the bottom.
   *
   * src2 is only used by the assert, which requires it to equal src1;
   * `unused` is not referenced.
   */
  1658. static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
  1659. const uint8_t *src1, const uint8_t *src2,
  1660. int width, uint32_t *unused)
  1661. {
  1662. __asm__ volatile(
  1663. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1664. "mov %0, %%"REG_a" \n\t"
  1665. "1: \n\t"
  1666. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1667. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1668. "pand %%mm4, %%mm0 \n\t"
  1669. "pand %%mm4, %%mm1 \n\t"
  1670. "packuswb %%mm1, %%mm0 \n\t"
  /* de-interleave: high bytes of the packed words are stored via %3 (dstV),
   * low bytes via %2 (dstU) */
  1671. "movq %%mm0, %%mm1 \n\t"
  1672. "psrlw $8, %%mm0 \n\t"
  1673. "pand %%mm4, %%mm1 \n\t"
  1674. "packuswb %%mm0, %%mm0 \n\t"
  1675. "packuswb %%mm1, %%mm1 \n\t"
  1676. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1677. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1678. "add $4, %%"REG_a" \n\t"
  1679. " js 1b \n\t"
  1680. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1681. : "%"REG_a
  1682. );
  1683. assert(src1 == src2);
  1684. }
  /*
   * De-interleave a byte stream: even-indexed bytes of src go to dst1,
   * odd-indexed bytes go to dst2.
   *
   * The index runs from -width upwards in steps of 8 over pointers that are
   * pre-advanced by width*2 (src) and width (dst1, dst2). Each pass reads 16
   * source bytes and stores 8 bytes to each destination; the test is at the
   * bottom of the loop, so one pass always runs.
   *
   * Shared helper for nv12ToUV and nv21ToUV, which differ only in which plane
   * they pass as dst1.
   */
  1685. static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
  1686. const uint8_t *src, int width)
  1687. {
  1688. __asm__ volatile(
  1689. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1690. "mov %0, %%"REG_a" \n\t"
  1691. "1: \n\t"
  1692. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1693. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1694. "movq %%mm0, %%mm2 \n\t"
  1695. "movq %%mm1, %%mm3 \n\t"
  /* mm0/mm1: low byte of each word; mm2/mm3: high byte of each word */
  1696. "pand %%mm4, %%mm0 \n\t"
  1697. "pand %%mm4, %%mm1 \n\t"
  1698. "psrlw $8, %%mm2 \n\t"
  1699. "psrlw $8, %%mm3 \n\t"
  1700. "packuswb %%mm1, %%mm0 \n\t"
  1701. "packuswb %%mm3, %%mm2 \n\t"
  1702. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1703. "movq %%mm2, (%3, %%"REG_a") \n\t"
  1704. "add $8, %%"REG_a" \n\t"
  1705. " js 1b \n\t"
  1706. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
  1707. : "%"REG_a
  1708. );
  1709. }
  /*
   * Chroma input hook for NV12 sources (installed by sws_init_swScale):
   * forwards to nvXXtoUV so that even source bytes land in dstU and odd
   * source bytes in dstV. src2 and `unused` are ignored.
   */
  1710. static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
  1711. const uint8_t *src1, const uint8_t *src2,
  1712. int width, uint32_t *unused)
  1713. {
  1714. RENAME(nvXXtoUV)(dstU, dstV, src1, width);
  1715. }
  /*
   * Chroma input hook for NV21 sources (installed by sws_init_swScale):
   * same as nv12ToUV but with the destinations swapped, so even source bytes
   * land in dstV and odd source bytes in dstU. src2 and `unused` are ignored.
   */
  1716. static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
  1717. const uint8_t *src1, const uint8_t *src2,
  1718. int width, uint32_t *unused)
  1719. {
  1720. RENAME(nvXXtoUV)(dstV, dstU, src1, width);
  1721. }
  1722. #endif /* !COMPILE_TEMPLATE_MMX2 */
  /*
   * Convert packed 24-bit pixels to one output byte per pixel (the luma
   * plane, per the function name) using MMX multiply-accumulate.
   *
   * srcFormat == PIX_FMT_BGR24 selects the ff_bgr24toY{1,2}Coeff pair, any
   * other value selects ff_rgb24toY{1,2}Coeff.
   *
   * The main loop consumes 12 source bytes (4 pixels) and stores 4 output
   * bytes per pass. src is advanced in place ("+r"); the output index runs
   * from -width upwards in steps of 4 over dst pre-advanced by width. The
   * test is at the bottom, so one pass always runs.
   */
  1723. static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src,
  1724. int width, enum PixelFormat srcFormat)
  1725. {
  /* The coefficients are loaded into mm5/mm6 by a separate asm statement; the
   * main loop below reads those registers without reloading them, so it
   * depends on nothing touching mm5/mm6 in between. */
  1726. if(srcFormat == PIX_FMT_BGR24) {
  1727. __asm__ volatile(
  1728. "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
  1729. "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
  1730. :
  1731. );
  1732. } else {
  1733. __asm__ volatile(
  1734. "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
  1735. "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
  1736. :
  1737. );
  1738. }
  1739. __asm__ volatile(
  1740. "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
  1741. "mov %2, %%"REG_a" \n\t"
  1742. "pxor %%mm7, %%mm7 \n\t"
  1743. "1: \n\t"
  1744. PREFETCH" 64(%0) \n\t"
  /* four overlapping 4-byte loads at offsets 0, 2, 6 and 8 cover the 12 bytes */
  1745. "movd (%0), %%mm0 \n\t"
  1746. "movd 2(%0), %%mm1 \n\t"
  1747. "movd 6(%0), %%mm2 \n\t"
  1748. "movd 8(%0), %%mm3 \n\t"
  1749. "add $12, %0 \n\t"
  /* widen bytes to words against the zeroed mm7 */
  1750. "punpcklbw %%mm7, %%mm0 \n\t"
  1751. "punpcklbw %%mm7, %%mm1 \n\t"
  1752. "punpcklbw %%mm7, %%mm2 \n\t"
  1753. "punpcklbw %%mm7, %%mm3 \n\t"
  1754. "pmaddwd %%mm5, %%mm0 \n\t"
  1755. "pmaddwd %%mm6, %%mm1 \n\t"
  1756. "pmaddwd %%mm5, %%mm2 \n\t"
  1757. "pmaddwd %%mm6, %%mm3 \n\t"
  1758. "paddd %%mm1, %%mm0 \n\t"
  1759. "paddd %%mm3, %%mm2 \n\t"
  /* add the offset constant, drop 15 fractional bits, saturate down to bytes */
  1760. "paddd %%mm4, %%mm0 \n\t"
  1761. "paddd %%mm4, %%mm2 \n\t"
  1762. "psrad $15, %%mm0 \n\t"
  1763. "psrad $15, %%mm2 \n\t"
  1764. "packssdw %%mm2, %%mm0 \n\t"
  1765. "packuswb %%mm0, %%mm0 \n\t"
  1766. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1767. "add $4, %%"REG_a" \n\t"
  1768. " js 1b \n\t"
  1769. : "+r" (src)
  1770. : "r" (dst+width), "g" ((x86_reg)-width)
  1771. : "%"REG_a
  1772. );
  1773. }
  /*
   * Luma input hook for PIX_FMT_BGR24 sources (installed by
   * sws_init_swScale): calls bgr24ToY_mmx with the BGR24 coefficient set.
   * `unused` is ignored.
   */
  1774. static void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src,
  1775. int width, uint32_t *unused)
  1776. {
  1777. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
  1778. }
  /*
   * Luma input hook for PIX_FMT_RGB24 sources (installed by
   * sws_init_swScale): calls bgr24ToY_mmx with PIX_FMT_RGB24, which makes it
   * pick the rgb24 coefficient set. `unused` is ignored.
   */
  1779. static void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src,
  1780. int width, uint32_t *unused)
  1781. {
  1782. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
  1783. }
  /*
   * Convert packed 24-bit pixels to two output planes (dstU and dstV), one
   * byte per pixel in each, using MMX multiply-accumulate.
   *
   * Operand %4 points at a coefficient table of four quadwords, chosen as
   * ff_bgr24toUV[1] when srcFormat is PIX_FMT_RGB24 and ff_bgr24toUV[0]
   * otherwise. Entries at offsets 0 and 8 produce the value stored through %1
   * (dstU); entries at offsets 16 and 24 (the latter cached in mm6) produce
   * the value stored through %2 (dstV).
   *
   * Each pass consumes 12 source bytes (4 pixels) and stores 4 bytes to each
   * plane. src is advanced in place ("+r"); the output index runs from -width
   * upwards in steps of 4 over destinations pre-advanced by width. The test
   * is at the bottom, so one pass always runs.
   */
  1784. static av_always_inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV,
  1785. const uint8_t *src, int width,
  1786. enum PixelFormat srcFormat)
  1787. {
  1788. __asm__ volatile(
  1789. "movq 24(%4), %%mm6 \n\t"
  1790. "mov %3, %%"REG_a" \n\t"
  1791. "pxor %%mm7, %%mm7 \n\t"
  1792. "1: \n\t"
  1793. PREFETCH" 64(%0) \n\t"
  /* first half of the group: source bytes 0..5 */
  1794. "movd (%0), %%mm0 \n\t"
  1795. "movd 2(%0), %%mm1 \n\t"
  1796. "punpcklbw %%mm7, %%mm0 \n\t"
  1797. "punpcklbw %%mm7, %%mm1 \n\t"
  1798. "movq %%mm0, %%mm2 \n\t"
  1799. "movq %%mm1, %%mm3 \n\t"
  1800. "pmaddwd (%4), %%mm0 \n\t"
  1801. "pmaddwd 8(%4), %%mm1 \n\t"
  1802. "pmaddwd 16(%4), %%mm2 \n\t"
  1803. "pmaddwd %%mm6, %%mm3 \n\t"
  1804. "paddd %%mm1, %%mm0 \n\t"
  1805. "paddd %%mm3, %%mm2 \n\t"
  /* second half of the group: source bytes 6..11 */
  1806. "movd 6(%0), %%mm1 \n\t"
  1807. "movd 8(%0), %%mm3 \n\t"
  1808. "add $12, %0 \n\t"
  1809. "punpcklbw %%mm7, %%mm1 \n\t"
  1810. "punpcklbw %%mm7, %%mm3 \n\t"
  1811. "movq %%mm1, %%mm4 \n\t"
  1812. "movq %%mm3, %%mm5 \n\t"
  1813. "pmaddwd (%4), %%mm1 \n\t"
  1814. "pmaddwd 8(%4), %%mm3 \n\t"
  1815. "pmaddwd 16(%4), %%mm4 \n\t"
  1816. "pmaddwd %%mm6, %%mm5 \n\t"
  1817. "paddd %%mm3, %%mm1 \n\t"
  1818. "paddd %%mm5, %%mm4 \n\t"
  /* add the offset constant, drop 15 fractional bits, saturate down to bytes */
  1819. "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
  1820. "paddd %%mm3, %%mm0 \n\t"
  1821. "paddd %%mm3, %%mm2 \n\t"
  1822. "paddd %%mm3, %%mm1 \n\t"
  1823. "paddd %%mm3, %%mm4 \n\t"
  1824. "psrad $15, %%mm0 \n\t"
  1825. "psrad $15, %%mm2 \n\t"
  1826. "psrad $15, %%mm1 \n\t"
  1827. "psrad $15, %%mm4 \n\t"
  1828. "packssdw %%mm1, %%mm0 \n\t"
  1829. "packssdw %%mm4, %%mm2 \n\t"
  1830. "packuswb %%mm0, %%mm0 \n\t"
  1831. "packuswb %%mm2, %%mm2 \n\t"
  1832. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1833. "movd %%mm2, (%2, %%"REG_a") \n\t"
  1834. "add $4, %%"REG_a" \n\t"
  1835. " js 1b \n\t"
  1836. : "+r" (src)
  1837. : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
  1838. : "%"REG_a
  1839. );
  1840. }
  /*
   * Chroma input hook for PIX_FMT_BGR24 sources (installed by
   * sws_init_swScale when the source chroma is not horizontally subsampled):
   * calls bgr24ToUV_mmx with the BGR24 coefficient set.
   *
   * src1 and src2 must be the same pointer; only src1 is read. The check is
   * made before the conversion, as in rgb24ToUV, so a violated precondition
   * aborts before dstU/dstV are written. `unused` is ignored.
   */
  1841. static void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV,
  1842. const uint8_t *src1, const uint8_t *src2,
  1843. int width, uint32_t *unused)
  1844. {
  1845. assert(src1 == src2);
  1846. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
  1847. }
  /*
   * Chroma input hook for PIX_FMT_RGB24 sources (installed by
   * sws_init_swScale when the source chroma is not horizontally subsampled):
   * calls bgr24ToUV_mmx with PIX_FMT_RGB24, which selects the second
   * coefficient table.
   *
   * src1 and src2 must be the same pointer (asserted); only src1 is read.
   * `unused` is ignored.
   */
  1848. static void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV,
  1849. const uint8_t *src1, const uint8_t *src2,
  1850. int width, uint32_t *unused)
  1851. {
  1852. assert(src1==src2);
  1853. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
  1854. }
  1855. #if COMPILE_TEMPLATE_MMX2
  /*
   * Fast horizontal luma scaler: runs the run-time generated code stored in
   * c->lumMmx2FilterCode over src and writes 16-bit results to dst.
   *
   * Register setup done by the asm statement:
   *   REG_c = src, REG_D = dst, REG_d = c->hLumFilter,
   *   REG_b = c->hLumFilterPos, REG_a = 0, mm7 = 0.
   * The generated code is then called eight times through
   * CALL_MMX2_FILTER_CODE. After each call the macro advances REG_c by the
   * 32-bit value found at REG_b + REG_a, advances REG_D by REG_a and clears
   * REG_a again.
   *
   * When PIC is defined REG_b cannot be listed as clobbered, so it is saved
   * to the local `ebxsave` on entry and restored at the end.
   *
   * After the asm, the C loop walks back from the last output sample and
   * overwrites every position whose source coordinate (i*xInc)>>16 is at or
   * beyond srcW-1 with src[srcW-1]*128.
   *
   * CALL_MMX2_FILTER_CODE is defined in the middle of this function and is
   * reused by hcscale_fast further down.
   */
  1856. static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
  1857. int dstWidth, const uint8_t *src,
  1858. int srcW, int xInc)
  1859. {
  1860. int16_t *filterPos = c->hLumFilterPos;
  1861. int16_t *filter = c->hLumFilter;
  1862. void *mmx2FilterCode= c->lumMmx2FilterCode;
  1863. int i;
  1864. #if defined(PIC)
  1865. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  1866. #endif
  1867. __asm__ volatile(
  1868. #if defined(PIC)
  1869. "mov %%"REG_b", %5 \n\t"
  1870. #endif
  1871. "pxor %%mm7, %%mm7 \n\t"
  1872. "mov %0, %%"REG_c" \n\t"
  1873. "mov %1, %%"REG_D" \n\t"
  1874. "mov %2, %%"REG_d" \n\t"
  1875. "mov %3, %%"REG_b" \n\t"
  1876. "xor %%"REG_a", %%"REG_a" \n\t" // i
  1877. PREFETCH" (%%"REG_c") \n\t"
  1878. PREFETCH" 32(%%"REG_c") \n\t"
  1879. PREFETCH" 64(%%"REG_c") \n\t"
  /* The 64-bit variant loads the 32-bit step into esi and adds the full
   * register; the 32-bit variant adds straight from memory. */
  1880. #if ARCH_X86_64
  1881. #define CALL_MMX2_FILTER_CODE \
  1882. "movl (%%"REG_b"), %%esi \n\t"\
  1883. "call *%4 \n\t"\
  1884. "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
  1885. "add %%"REG_S", %%"REG_c" \n\t"\
  1886. "add %%"REG_a", %%"REG_D" \n\t"\
  1887. "xor %%"REG_a", %%"REG_a" \n\t"\
  1888. #else
  1889. #define CALL_MMX2_FILTER_CODE \
  1890. "movl (%%"REG_b"), %%esi \n\t"\
  1891. "call *%4 \n\t"\
  1892. "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
  1893. "add %%"REG_a", %%"REG_D" \n\t"\
  1894. "xor %%"REG_a", %%"REG_a" \n\t"\
  1895. #endif /* ARCH_X86_64 */
  1896. CALL_MMX2_FILTER_CODE
  1897. CALL_MMX2_FILTER_CODE
  1898. CALL_MMX2_FILTER_CODE
  1899. CALL_MMX2_FILTER_CODE
  1900. CALL_MMX2_FILTER_CODE
  1901. CALL_MMX2_FILTER_CODE
  1902. CALL_MMX2_FILTER_CODE
  1903. CALL_MMX2_FILTER_CODE
  1904. #if defined(PIC)
  1905. "mov %5, %%"REG_b" \n\t"
  1906. #endif
  1907. :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
  1908. "m" (mmx2FilterCode)
  1909. #if defined(PIC)
  1910. ,"m" (ebxsave)
  1911. #endif
  1912. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  1913. #if !defined(PIC)
  1914. ,"%"REG_b
  1915. #endif
  1916. );
  /* pad the right edge with the last source pixel, scaled by 128 */
  1917. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
  1918. dst[i] = src[srcW-1]*128;
  1919. }
  /*
   * Fast horizontal chroma scaler: runs the run-time generated code stored in
   * c->chrMmx2FilterCode over two source lines, src1 -> dst1 and then
   * src2 -> dst2.
   *
   * Register setup matches hyscale_fast, using c->hChrFilter (REG_d) and
   * c->hChrFilterPos (REG_b). The generated code is called four times per
   * plane through CALL_MMX2_FILTER_CODE (the macro defined inside
   * hyscale_fast). Between the two planes only REG_a, REG_c and REG_D are
   * reloaded.
   *
   * When PIC is defined REG_b is saved to `ebxsave` (operand %7) on entry and
   * restored at the end instead of being declared as clobbered.
   *
   * After the asm, the C loop walks back from the last output sample and
   * overwrites, in both planes, every position whose source coordinate
   * (i*xInc)>>16 is at or beyond srcW-1 with the last source pixel times 128.
   */
  1920. static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
  1921. int dstWidth, const uint8_t *src1,
  1922. const uint8_t *src2, int srcW, int xInc)
  1923. {
  1924. int16_t *filterPos = c->hChrFilterPos;
  1925. int16_t *filter = c->hChrFilter;
  1926. void *mmx2FilterCode= c->chrMmx2FilterCode;
  1927. int i;
  1928. #if defined(PIC)
  1929. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  1930. #endif
  1931. __asm__ volatile(
  1932. #if defined(PIC)
  1933. "mov %%"REG_b", %7 \n\t"
  1934. #endif
  1935. "pxor %%mm7, %%mm7 \n\t"
  1936. "mov %0, %%"REG_c" \n\t"
  1937. "mov %1, %%"REG_D" \n\t"
  1938. "mov %2, %%"REG_d" \n\t"
  1939. "mov %3, %%"REG_b" \n\t"
  1940. "xor %%"REG_a", %%"REG_a" \n\t" // i
  1941. PREFETCH" (%%"REG_c") \n\t"
  1942. PREFETCH" 32(%%"REG_c") \n\t"
  1943. PREFETCH" 64(%%"REG_c") \n\t"
  /* first plane: src1 -> dst1 */
  1944. CALL_MMX2_FILTER_CODE
  1945. CALL_MMX2_FILTER_CODE
  1946. CALL_MMX2_FILTER_CODE
  1947. CALL_MMX2_FILTER_CODE
  /* second plane: src2 -> dst2 */
  1948. "xor %%"REG_a", %%"REG_a" \n\t" // i
  1949. "mov %5, %%"REG_c" \n\t" // src
  1950. "mov %6, %%"REG_D" \n\t" // buf2
  1951. PREFETCH" (%%"REG_c") \n\t"
  1952. PREFETCH" 32(%%"REG_c") \n\t"
  1953. PREFETCH" 64(%%"REG_c") \n\t"
  1954. CALL_MMX2_FILTER_CODE
  1955. CALL_MMX2_FILTER_CODE
  1956. CALL_MMX2_FILTER_CODE
  1957. CALL_MMX2_FILTER_CODE
  1958. #if defined(PIC)
  1959. "mov %7, %%"REG_b" \n\t"
  1960. #endif
  1961. :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
  1962. "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
  1963. #if defined(PIC)
  1964. ,"m" (ebxsave)
  1965. #endif
  1966. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  1967. #if !defined(PIC)
  1968. ,"%"REG_b
  1969. #endif
  1970. );
  /* pad the right edge of both planes with the last source pixel, scaled by 128 */
  1971. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
  1972. dst1[i] = src1[srcW-1]*128;
  1973. dst2[i] = src2[srcW-1]*128;
  1974. }
  1975. }
  1976. #endif /* COMPILE_TEMPLATE_MMX2 */
  /*
   * Install this template's routines into the function pointers of the
   * scaler context `c`, based on c->srcFormat, c->dstFormat and c->flags.
   * Pointers for combinations not listed here are left as they were, except
   * hyscale_fast/hcscale_fast, which are explicitly reset to NULL when the
   * generated-code scaler is not selected.
   */
  1977. static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
  1978. {
  1979. enum PixelFormat srcFormat = c->srcFormat,
  1980. dstFormat = c->dstFormat;
  /* Output stage: skipped entirely for 16-bit, 9/10-bit, NV12 and NV21
   * destinations. */
  1981. if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
  1982. dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) {
  /* yuv2yuv1 / yuv2yuvX / yuv2packedX are only replaced when bit-exact
   * output is not requested. */
  1983. if (!(c->flags & SWS_BITEXACT)) {
  /* SWS_ACCURATE_RND selects the "_ar" variants. */
  1984. if (c->flags & SWS_ACCURATE_RND) {
  1985. c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
  1986. c->yuv2yuvX = RENAME(yuv2yuvX_ar );
  1987. if (!(c->flags & SWS_FULL_CHR_H_INT)) {
  1988. switch (c->dstFormat) {
  1989. case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
  1990. case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
  1991. case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
  1992. case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
  1993. case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
  1994. default: break;
  1995. }
  1996. }
  1997. } else {
  1998. c->yuv2yuv1 = RENAME(yuv2yuv1 );
  1999. c->yuv2yuvX = RENAME(yuv2yuvX );
  2000. if (!(c->flags & SWS_FULL_CHR_H_INT)) {
  2001. switch (c->dstFormat) {
  2002. case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
  2003. case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
  2004. case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
  2005. case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
  2006. case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
  2007. default: break;
  2008. }
  2009. }
  2010. }
  2011. }
  /* The one- and two-line packed writers are installed regardless of
   * SWS_BITEXACT, as long as full horizontal chroma interpolation is off. */
  2012. if (!(c->flags & SWS_FULL_CHR_H_INT)) {
  2013. switch (c->dstFormat) {
  2014. case PIX_FMT_RGB32:
  2015. c->yuv2packed1 = RENAME(yuv2rgb32_1);
  2016. c->yuv2packed2 = RENAME(yuv2rgb32_2);
  2017. break;
  2018. case PIX_FMT_BGR24:
  2019. c->yuv2packed1 = RENAME(yuv2bgr24_1);
  2020. c->yuv2packed2 = RENAME(yuv2bgr24_2);
  2021. break;
  2022. case PIX_FMT_RGB555:
  2023. c->yuv2packed1 = RENAME(yuv2rgb555_1);
  2024. c->yuv2packed2 = RENAME(yuv2rgb555_2);
  2025. break;
  2026. case PIX_FMT_RGB565:
  2027. c->yuv2packed1 = RENAME(yuv2rgb565_1);
  2028. c->yuv2packed2 = RENAME(yuv2rgb565_2);
  2029. break;
  2030. case PIX_FMT_YUYV422:
  2031. c->yuv2packed1 = RENAME(yuv2yuyv422_1);
  2032. c->yuv2packed2 = RENAME(yuv2yuyv422_2);
  2033. break;
  2034. default:
  2035. break;
  2036. }
  2037. }
  2038. }
  /* Horizontal fast-bilinear scalers: only considered for 8-bit sources with
   * at most 10-bit intermediate depth. In the MMX2 build they are installed
   * when SWS_FAST_BILINEAR is set and c->canMMX2BeUsed is true; in every other
   * case both pointers are set to NULL. */
  2039. if (c->srcBpc == 8 && c->dstBpc <= 10) {
  2040. // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
  2041. #if COMPILE_TEMPLATE_MMX2
  2042. if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
  2043. {
  2044. c->hyscale_fast = RENAME(hyscale_fast);
  2045. c->hcscale_fast = RENAME(hcscale_fast);
  2046. } else {
  2047. #endif /* COMPILE_TEMPLATE_MMX2 */
  2048. c->hyscale_fast = NULL;
  2049. c->hcscale_fast = NULL;
  2050. #if COMPILE_TEMPLATE_MMX2
  2051. }
  2052. #endif /* COMPILE_TEMPLATE_MMX2 */
  2053. }
  /* Chroma input converters for packed YUV and NV12/NV21 sources exist only
   * in the non-MMX2 build of this template. */
  2054. #if !COMPILE_TEMPLATE_MMX2
  2055. switch(srcFormat) {
  2056. case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
  2057. case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
  2058. case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
  2059. case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
  2060. default: break;
  2061. }
  2062. #endif /* !COMPILE_TEMPLATE_MMX2 */
  /* The 24-bit RGB chroma converters are used only when the source chroma is
   * not horizontally subsampled. */
  2063. if (!c->chrSrcHSubSample) {
  2064. switch(srcFormat) {
  2065. case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
  2066. case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
  2067. default: break;
  2068. }
  2069. }
  /* Luma input converters. PIX_FMT_YUYV422 falls through to share yuy2ToY
   * with PIX_FMT_Y400A. */
  2070. switch (srcFormat) {
  2071. #if !COMPILE_TEMPLATE_MMX2
  2072. case PIX_FMT_YUYV422 :
  2073. case PIX_FMT_Y400A : c->lumToYV12 = RENAME(yuy2ToY); break;
  2074. case PIX_FMT_UYVY422 : c->lumToYV12 = RENAME(uyvyToY); break;
  2075. #endif /* !COMPILE_TEMPLATE_MMX2 */
  2076. case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
  2077. case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
  2078. default: break;
  2079. }
  /* Alpha input converter, installed only when an alpha buffer is present.
   * NOTE(review): Y400A uses the same yuy2ToY routine as its luma path;
   * presumably the caller offsets the source pointer to reach the alpha
   * bytes -- confirm at the call site. */
  2080. #if !COMPILE_TEMPLATE_MMX2
  2081. if (c->alpPixBuf) {
  2082. switch (srcFormat) {
  2083. case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
  2084. default: break;
  2085. }
  2086. }
  2087. #endif /* !COMPILE_TEMPLATE_MMX2 */
  2088. }