/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PREFETCH

#if COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
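
/* YSCALEYUV2YV12X: vertical multi-tap filter for one plane. Accumulates
 * filterCoeff * srcData over the tap list (terminated by a NULL source
 * pointer) on top of the 16-bit dither preloaded from DITHER16, then
 * shifts down (>>3), packs to unsigned bytes and stores 8 output pixels
 * per iteration. */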
#define YSCALEYUV2YV12X(offset, dest, end, pos) \
    __asm__ volatile(\
        "movq "DITHER16"+0(%0), %%mm3 \n\t"\
        "movq "DITHER16"+8(%0), %%mm4 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ".p2align 4 \n\t" /* FIXME Unroll? */\
        "1: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
        "movq 8(%%"REG_S", %3, 2), %%mm5 \n\t" /* srcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        " jnz 1b \n\t"\
        "psraw $3, %%mm3 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm3 \n\t"\
        MOVNTQ(%%mm3, (%1, %3))\
        "add $8, %3 \n\t"\
        "cmp %2, %3 \n\t"\
        "movq "DITHER16"+0(%0), %%mm3 \n\t"\
        "movq "DITHER16"+8(%0), %%mm4 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "r" (&c->redDither),\
           "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
        : "%"REG_d, "%"REG_S\
    );
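
/* Expand the 8 dither bytes at srcDither into eight 16-bit values stored
 * at DITHER16 (c->dither16). Each byte is zero-extended and shifted down
 * by 4 bits; with rot != 0 the pattern is first rotated right by three
 * bytes, the variant the callers use for the second (V) chroma pass. */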
#if !COMPILE_TEMPLATE_MMX2
static av_always_inline void
dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot)
{
    if (rot) {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "psrlq $24, %%mm3\n\t"
                         "psllq $40, %%mm4\n\t"
                         "por %%mm4, %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         "psraw $4, %%mm3\n\t"
                         "psraw $4, %%mm4\n\t"
                         "movq %%mm3, "DITHER16"+0(%1)\n\t"
                         "movq %%mm4, "DITHER16"+8(%1)\n\t"
                         :: "r"(srcDither), "r"(&c->redDither)
                         );
    } else {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         "psraw $4, %%mm3\n\t"
                         "psraw $4, %%mm4\n\t"
                         "movq %%mm3, "DITHER16"+0(%1)\n\t"
                         "movq %%mm4, "DITHER16"+8(%1)\n\t"
                         :: "r"(srcDither), "r"(&c->redDither)
                         );
    }
}
#endif
static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
                             const int16_t **lumSrc, int lumFilterSize,
                             const int16_t *chrFilter, const int16_t **chrUSrc,
                             const int16_t **chrVSrc,
                             int chrFilterSize, const int16_t **alpSrc,
                             uint8_t *dest[4], int dstW, int chrDstW)
{
    uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
            *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;

    if (uDest) {
        x86_reg uv_off = c->uv_offx2 >> 1;
        dither_8to16(c, chrDither, 0);
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
        dither_8to16(c, chrDither, 1);
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
    }
    dither_8to16(c, lumDither, 0);
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
    }

    YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, yDest, dstW, 0)
}
#define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
    __asm__ volatile(\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "movq "DITHER32"+0(%0), %%mm4 \n\t"\
        "movq "DITHER32"+8(%0), %%mm5 \n\t"\
        "movq "DITHER32"+16(%0), %%mm6 \n\t"\
        "movq "DITHER32"+24(%0), %%mm7 \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ".p2align 4 \n\t"\
        "1: \n\t"\
        "movq (%%"REG_S", %3, 2), %%mm0 \n\t" /* srcData */\
        "movq 8(%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %3, 2), %%mm1 \n\t" /* srcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "movq 8(%%"REG_S", %3, 2), %%mm3 \n\t" /* srcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 1b \n\t"\
        "psrad $19, %%mm4 \n\t"\
        "psrad $19, %%mm5 \n\t"\
        "psrad $19, %%mm6 \n\t"\
        "psrad $19, %%mm7 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "packuswb %%mm6, %%mm4 \n\t"\
        MOVNTQ(%%mm4, (%1, %3))\
        "add $8, %3 \n\t"\
        "cmp %2, %3 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "movq "DITHER32"+0(%0), %%mm4 \n\t"\
        "movq "DITHER32"+8(%0), %%mm5 \n\t"\
        "movq "DITHER32"+16(%0), %%mm6 \n\t"\
        "movq "DITHER32"+24(%0), %%mm7 \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "r" (&c->redDither),\
           "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
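
/* Same idea as dither_8to16, but for the high-accuracy path: the dither
 * bytes are widened to 32 bits and shifted left by 12 so they can seed
 * the 32-bit accumulators that YSCALEYUV2YV12X_ACCURATE reloads from
 * DITHER32. */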
#if !COMPILE_TEMPLATE_MMX2
static av_always_inline void
dither_8to32(SwsContext *c, const uint8_t *srcDither, int rot)
{
    if (rot) {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm4\n\t"
                         "movq %%mm4, %%mm5\n\t"
                         "psrlq $24, %%mm4\n\t"
                         "psllq $40, %%mm5\n\t"
                         "por %%mm5, %%mm4\n\t"
                         "movq %%mm4, %%mm6\n\t"
                         "punpcklbw %%mm0, %%mm4\n\t"
                         "punpckhbw %%mm0, %%mm6\n\t"
                         "movq %%mm4, %%mm5\n\t"
                         "movq %%mm6, %%mm7\n\t"
                         "punpcklwd %%mm0, %%mm4\n\t"
                         "punpckhwd %%mm0, %%mm5\n\t"
                         "punpcklwd %%mm0, %%mm6\n\t"
                         "punpckhwd %%mm0, %%mm7\n\t"
                         "pslld $12, %%mm4\n\t"
                         "pslld $12, %%mm5\n\t"
                         "pslld $12, %%mm6\n\t"
                         "pslld $12, %%mm7\n\t"
                         "movq %%mm4, "DITHER32"+0(%1)\n\t"
                         "movq %%mm5, "DITHER32"+8(%1)\n\t"
                         "movq %%mm6, "DITHER32"+16(%1)\n\t"
                         "movq %%mm7, "DITHER32"+24(%1)\n\t"
                         :: "r"(srcDither), "r"(&c->redDither)
                         );
    } else {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm4\n\t"
                         "movq %%mm4, %%mm6\n\t"
                         "punpcklbw %%mm0, %%mm4\n\t"
                         "punpckhbw %%mm0, %%mm6\n\t"
                         "movq %%mm4, %%mm5\n\t"
                         "movq %%mm6, %%mm7\n\t"
                         "punpcklwd %%mm0, %%mm4\n\t"
                         "punpckhwd %%mm0, %%mm5\n\t"
                         "punpcklwd %%mm0, %%mm6\n\t"
                         "punpckhwd %%mm0, %%mm7\n\t"
                         "pslld $12, %%mm4\n\t"
                         "pslld $12, %%mm5\n\t"
                         "pslld $12, %%mm6\n\t"
                         "pslld $12, %%mm7\n\t"
                         "movq %%mm4, "DITHER32"+0(%1)\n\t"
                         "movq %%mm5, "DITHER32"+8(%1)\n\t"
                         "movq %%mm6, "DITHER32"+16(%1)\n\t"
                         "movq %%mm7, "DITHER32"+24(%1)\n\t"
                         :: "r"(srcDither), "r"(&c->redDither)
                         );
    }
}
#endif
static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest[4], int dstW, int chrDstW)
{
    uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
            *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;

    if (uDest) {
        x86_reg uv_off = c->uv_offx2 >> 1;
        dither_8to32(c, chrDither, 0);
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
        dither_8to32(c, chrDither, 1);
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
    }
    dither_8to32(c, lumDither, 0);
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
    }

    YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, yDest, dstW, 0)
}
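
/* Unscaled 1:1 vertical path: each plane is just shifted down (>>7) and
 * packed to bytes. The loop below runs a negative index up towards zero,
 * so the "add $8" doubles as the termination test (jnc). */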
static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
                             const int16_t *chrUSrc, const int16_t *chrVSrc,
                             const int16_t *alpSrc,
                             uint8_t *dst[4], int dstW, int chrDstW)
{
    int p = 4;
    const int16_t *src[4] = {
        lumSrc + dstW, chrUSrc + chrDstW,
        chrVSrc + chrDstW, alpSrc + dstW
    };
    x86_reg counter[4] = { dstW, chrDstW, chrDstW, dstW };

    while (p--) {
        if (dst[p]) {
            __asm__ volatile(
                "mov %2, %%"REG_a" \n\t"
                ".p2align 4 \n\t" /* FIXME Unroll? */
                "1: \n\t"
                "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
                "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
                "psraw $7, %%mm0 \n\t"
                "psraw $7, %%mm1 \n\t"
                "packuswb %%mm1, %%mm0 \n\t"
                MOVNTQ(%%mm0, (%1, %%REGa))
                "add $8, %%"REG_a" \n\t"
                "jnc 1b \n\t"
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }
}
static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
                                const int16_t *chrUSrc, const int16_t *chrVSrc,
                                const int16_t *alpSrc,
                                uint8_t *dst[4], int dstW, int chrDstW)
{
    int p = 4;
    const int16_t *src[4] = {
        lumSrc + dstW, chrUSrc + chrDstW,
        chrVSrc + chrDstW, alpSrc + dstW
    };
    x86_reg counter[4] = { dstW, chrDstW, chrDstW, dstW };
    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;

    while (p--) {
        if (dst[p]) {
            int i;
            for (i = 0; i < 8; i++)
                c->dither16[i] = (p == 2 || p == 3) ? lumDither[i] : chrDither[i];
            __asm__ volatile(
                "mov %2, %%"REG_a" \n\t"
                "movq "DITHER16"+0(%3), %%mm6 \n\t"
                "movq "DITHER16"+8(%3), %%mm7 \n\t"
                ".p2align 4 \n\t" /* FIXME Unroll? */
                "1: \n\t"
                "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
                "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
                "paddsw %%mm6, %%mm0 \n\t"
                "paddsw %%mm7, %%mm1 \n\t"
                "psraw $7, %%mm0 \n\t"
                "psraw $7, %%mm1 \n\t"
                "packuswb %%mm1, %%mm0 \n\t"
                MOVNTQ(%%mm0, (%1, %%REGa))
                "add $8, %%"REG_a" \n\t"
                "jnc 1b \n\t"
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p]), "r"(&c->redDither)
                : "%"REG_a
            );
        }
    }
}
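
/* Chroma part of the multi-tap packed-output scalers: loops over the
 * chroma filter taps and leaves 4 interpolated U values in %%mm3 and
 * 4 V values in %%mm4; operand %6 holds the byte offset from the U
 * source row to the V source row. */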
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
        :: "r" (&c->redDither), \
           "m" (dummy), "m" (dummy), "m" (dummy),\
           "r" (dest), "m" (dstW_reg), "m"(uv_off) \
        : "%"REG_a, "%"REG_d, "%"REG_S \
    );
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "movq %%mm4, "U_TEMP"(%0) \n\t"\
        "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "movq %%mm2, "U_TEMP"(%0) \n\t"
        "movq %%mm4, "V_TEMP"(%0) \n\t"
        "movq %%mm5, "Y_TEMP"(%0) \n\t"
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
        "movq "Y_TEMP"(%0), %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}
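
/* Pack 8 pixels to RGB565: keep the top 5 bits of B and R and the top
 * 6 bits of G (the bF8/bFC masks), then shift and OR the three fields
 * into 16-bit words, stored 4 pixels per MOVNTQ. */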
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
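
/* 24-bit output has no padding byte to spare, so the plain-MMX variant
 * below builds 0RGB dwords and shifts/ORs them into three contiguous
 * quadwords (24 bytes = 8 pixels); the MMX2 variant afterwards does the
 * same job with pshufw and the ff_M24* masks. */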
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}
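
/* Interleave 8 luma bytes (%%mm1/%%mm7) with 4 U (%%mm3) and 4 V (%%mm4)
 * samples into YUYV byte order and store 16 bytes per iteration. */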
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                     const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrUSrc,
                                     const int16_t **chrVSrc,
                                     int chrFilterSize, const int16_t **alpSrc,
                                     uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
                                  const int16_t **lumSrc, int lumFilterSize,
                                  const int16_t *chrFilter, const int16_t **chrUSrc,
                                  const int16_t **chrVSrc,
                                  int chrFilterSize, const int16_t **alpSrc,
                                  uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
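
/* Two-row (bilinear) vertical interpolation used by the *_2 output
 * functions below: chroma = uvbuf1>>4 + ((uvbuf0 - uvbuf1)*uvalpha1 >> 16),
 * computed for U in %%mm3 and V in %%mm4, followed by the usual
 * offset/coefficient setup for the YUV->RGB conversion. */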
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >> 4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >> 4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]*uvalpha1 + uvbuf1[eax]*(1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]*uvalpha1 + uvbuf1[eax+2048]*(1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >> 4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax] >> 4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
/**
 * vertical bilinear scale YV12 to RGB
 */
static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
#if ARCH_X86_64
        __asm__ volatile(
            YSCALEYUV2RGB(%%r8, %5)
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
               "a" (&c->redDither),
               "r" (abuf0), "r" (abuf1)
            : "%r8"
        );
#else
        c->u_temp = (intptr_t)abuf0;
        c->v_temp = (intptr_t)abuf1;
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "push %0 \n\t"
            "push %1 \n\t"
            "mov "U_TEMP"(%5), %0 \n\t"
            "mov "V_TEMP"(%5), %1 \n\t"
            YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            "pop %1 \n\t"
            "pop %0 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
#endif
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB15(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB16(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
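
/* Bilinear two-row interpolation for packed YUYV output. The luma and
 * chroma blend coefficients are pre-shifted down by 3 here, so the
 * interpolated Y/U/V values already come out in the 8-bit range that
 * WRITEYUY2 expects. */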
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >> 7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >> 7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]*uvalpha1 + uvbuf1[eax]*(1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]*uvalpha1 + uvbuf1[eax+2048]*(1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >> 7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax] >> 7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf[2], uint8_t *dest,
                                  int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2PACKED(%%REGBP, %5)
        WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
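
/* Single-source-row variants: YSCALEYUV2RGB1 reads only uvbuf0 (used by
 * the callers below when uvalpha < 2048, i.e. the nearest chroma row),
 * while YSCALEYUV2RGB1b averages uvbuf0 and uvbuf1 instead. */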
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >> 4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >> 4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >> 4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >> 4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >> 4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >> 4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
/**
 * YV12 to RGB without scaling or interpolating
 */
static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *bguf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    } else {
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    }
}
static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *bguf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
  1425. static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
  1426. const int16_t *ubuf[2], const int16_t *bguf[2],
  1427. const int16_t *abuf0, uint8_t *dest,
  1428. int dstW, int uvalpha, int y)
  1429. {
  1430. const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1431. const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1432. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1433. __asm__ volatile(
  1434. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1435. "mov %4, %%"REG_b" \n\t"
  1436. "push %%"REG_BP" \n\t"
  1437. YSCALEYUV2RGB1(%%REGBP, %5)
  1438. "pxor %%mm7, %%mm7 \n\t"
  1439. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
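            /* With DITHER1XBPP, a small per-channel ordered-dither bias
             * (tables in the context at BLUE/GREEN/RED_DITHER) is added with
             * a saturating paddusb before WRITERGB15 truncates each 8-bit
             * channel to 5 bits, trading banding for noise at essentially no
             * extra per-pixel cost. */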
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *bguf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
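/* YSCALEYUV2PACKED1(b) only denormalize the 15-bit intermediates back to
 * 8 bits (psraw $7 for one line, paddw + psrlw $8 to average two lines);
 * no YUV->RGB matrix is applied. The results (Y in mm1/mm7, U/V in
 * mm3/mm4) feed WRITEYUY2, defined earlier in this file, which interleaves
 * them as Y0 U Y1 V. */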
static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
                                  const int16_t *ubuf[2], const int16_t *bguf[2],
                                  const int16_t *abuf0, uint8_t *dest,
                                  int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1b(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
#if !COMPILE_TEMPLATE_MMX2
//FIXME yuy2* can read up to 7 samples too much
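/* For reference, a plain C model of the two YUYV unpackers below (a sketch,
 * not compiled in): YUYV memory is laid out as Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...,
 * so luma lives in the even bytes and chroma in the odd ones. The asm gets
 * the same effect with the bm01010101 byte mask and psrlw/packuswb. */
#if 0
static void yuy2ToY_ref(uint8_t *dst, const uint8_t *src, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = src[2*i];          /* even bytes: luma */
}
static void yuy2ToUV_ref(uint8_t *dstU, uint8_t *dstV,
                         const uint8_t *src, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dstU[i] = src[4*i + 1];     /* odd bytes alternate U ... */
        dstV[i] = src[4*i + 3];     /* ... and V */
    }
}
#endif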
static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src,
                            int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}
static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}
/* This is almost identical to the previous one, and exists only because
 * yuy2To(Y|UV)(dst, src + 1, ...) would have 100% unaligned accesses. */
static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src,
                            int width, uint32_t *unused)
{
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}
static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}
static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                              const uint8_t *src, int width)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "movq %%mm2, (%3, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
}
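/* A C model of the NV12/NV21 chroma split above (a sketch, not compiled in):
 * the semi-planar formats store chroma interleaved as U V U V ... (NV12) or
 * V U V U ... (NV21), so the two wrappers below differ only in which
 * destination plane receives the even and the odd bytes. */
#if 0
static void nvXXtoUV_ref(uint8_t *dst1, uint8_t *dst2,
                         const uint8_t *src, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dst1[i] = src[2*i];         /* even bytes */
        dst2[i] = src[2*i + 1];     /* odd bytes */
    }
}
#endif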
static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}
static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
#endif /* !COMPILE_TEMPLATE_MMX2 */
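/* RGB24/BGR24 -> Y below is a pmaddwd dot product: each 12-byte (4-pixel)
 * group is read as two overlapping 4-byte loads per pixel pair (offsets 0/2
 * and 6/8), unpacked to words, multiplied by the coefficient tables
 * (ff_bgr24toY*Coeff / ff_rgb24toY*Coeff, defined elsewhere in swscale),
 * rounded with ff_bgr24toYOffset and scaled down with psrad $9, leaving
 * luma in the 15-bit intermediate format. Roughly, per pixel:
 *   Y15 = (cB*B + cG*G + cR*R + offset) >> 9
 * The UV variant further down works the same way, with its coefficient
 * table selected per source format via ff_bgr24toUV[]. */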
static av_always_inline void RENAME(bgr24ToY_mmx)(int16_t *dst, const uint8_t *src,
                                                  int width, enum PixelFormat srcFormat)
{
    if (srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
            :
        );
    }
    __asm__ volatile(
        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
        "mov %2, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "movd 6(%0), %%mm2 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm4, %%mm0 \n\t"
        "paddd %%mm4, %%mm2 \n\t"
        "psrad $9, %%mm0 \n\t"
        "psrad $9, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dst+width), "g" ((x86_reg)-2*width)
        : "%"REG_a
    );
}
static void RENAME(bgr24ToY)(int16_t *dst, const uint8_t *src,
                             int width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
}
static void RENAME(rgb24ToY)(int16_t *dst, const uint8_t *src,
                             int width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
}
static av_always_inline void RENAME(bgr24ToUV_mmx)(int16_t *dstU, int16_t *dstV,
                                                   const uint8_t *src, int width,
                                                   enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq 24(%4), %%mm6 \n\t"
        "mov %3, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pmaddwd (%4), %%mm0 \n\t"
        "pmaddwd 8(%4), %%mm1 \n\t"
        "pmaddwd 16(%4), %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "movd 6(%0), %%mm1 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "pmaddwd (%4), %%mm1 \n\t"
        "pmaddwd 8(%4), %%mm3 \n\t"
        "pmaddwd 16(%4), %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm5 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm5, %%mm4 \n\t"
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
        "paddd %%mm3, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm3, %%mm4 \n\t"
        "psrad $9, %%mm0 \n\t"
        "psrad $9, %%mm2 \n\t"
        "psrad $9, %%mm1 \n\t"
        "psrad $9, %%mm4 \n\t"
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm4, %%mm2 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "movq %%mm2, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-2*width), "r" (ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
        : "%"REG_a
    );
}
static void RENAME(bgr24ToUV)(int16_t *dstU, int16_t *dstV,
                              const uint8_t *src1, const uint8_t *src2,
                              int width, uint32_t *unused)
{
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
    assert(src1 == src2);
}
static void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV,
                              const uint8_t *src1, const uint8_t *src2,
                              int width, uint32_t *unused)
{
    assert(src1 == src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
}
#if COMPILE_TEMPLATE_MMX2
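/* The two "fast bilinear" horizontal scalers below do not loop over filter
 * taps themselves: c->lumMmx2FilterCode / c->chrMmx2FilterCode point at
 * machine code that swscale generates at init time for the exact scaling
 * ratio. Each CALL_MMX2_FILTER_CODE invocation runs one generated chunk
 * covering a fixed fraction of the output row (eight chunks for luma, four
 * per plane for chroma), with filterPos supplying the per-chunk source
 * offsets. */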
static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                 int dstWidth, const uint8_t *src,
                                 int srcW, int xInc)
{
    int16_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    void *mmx2FilterCode = c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %5 \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
#if ARCH_X86_64
#define CALL_MMX2_FILTER_CODE \
        "movl (%%"REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
        "add %%"REG_S", %%"REG_c" \n\t"\
        "add %%"REG_a", %%"REG_D" \n\t"\
        "xor %%"REG_a", %%"REG_a" \n\t"
#else
#define CALL_MMX2_FILTER_CODE \
        "movl (%%"REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
        "add %%"REG_a", %%"REG_D" \n\t"\
        "xor %%"REG_a", %%"REG_a" \n\t"
#endif /* ARCH_X86_64 */
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
#if defined(PIC)
        "mov %5, %%"REG_b" \n\t"
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
          ,"%"REG_b
#endif
    );
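    /* Output pixels whose source position (i*xInc>>16) lands on or past the
     * last input sample are redone here with simple edge replication; the
     * *128 (<<7) converts the 8-bit sample to the 15-bit intermediate scale
     * used by the rest of the pipeline. The chroma version below does the
     * same for both planes. */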
    for (i = dstWidth - 1; (i*xInc)>>16 >= srcW - 1; i--)
        dst[i] = src[srcW-1]*128;
}
static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
                                 int dstWidth, const uint8_t *src1,
                                 const uint8_t *src2, int srcW, int xInc)
{
    int16_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    void *mmx2FilterCode = c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %7 \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "mov %5, %%"REG_c" \n\t" // src
        "mov %6, %%"REG_D" \n\t" // buf2
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
#if defined(PIC)
        "mov %7, %%"REG_b" \n\t"
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode), "m" (src2), "m" (dst2)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
          ,"%"REG_b
#endif
    );
    for (i = dstWidth - 1; (i*xInc)>>16 >= srcW - 1; i--) {
        dst1[i] = src1[srcW-1]*128;
        dst2[i] = src2[srcW-1]*128;
    }
}
#endif /* COMPILE_TEMPLATE_MMX2 */
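/* Template entry point: fills in the SwsContext function-pointer table with
 * the MMX/MMX2 implementations above. The _ar variants are the
 * accurate-rounding versions, selected by SWS_ACCURATE_RND or forced for
 * the unscaled path when dithering down from >8-bit sources. */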
static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat,
                     dstFormat = c->dstFormat;
    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != PIX_FMT_NV12
        && dstFormat != PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            c->yuv2yuv1 = RENAME(yuv2yuv1_ar);
            c->yuv2yuvX = RENAME(yuv2yuvX_ar);
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
                case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
                case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
                case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
                case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
                default: break;
                }
            }
        } else {
            int should_dither = isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
            c->yuv2yuv1 = should_dither ? RENAME(yuv2yuv1_ar) : RENAME(yuv2yuv1);
            c->yuv2yuvX = RENAME(yuv2yuvX);
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
                case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
                case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
                case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
                case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
                default: break;
                }
            }
        }
        if (!(c->flags & SWS_FULL_CHR_H_INT)) {
            switch (c->dstFormat) {
            case PIX_FMT_RGB32:
                c->yuv2packed1 = RENAME(yuv2rgb32_1);
                c->yuv2packed2 = RENAME(yuv2rgb32_2);
                break;
            case PIX_FMT_BGR24:
                c->yuv2packed1 = RENAME(yuv2bgr24_1);
                c->yuv2packed2 = RENAME(yuv2bgr24_2);
                break;
            case PIX_FMT_RGB555:
                c->yuv2packed1 = RENAME(yuv2rgb555_1);
                c->yuv2packed2 = RENAME(yuv2rgb555_2);
                break;
            case PIX_FMT_RGB565:
                c->yuv2packed1 = RENAME(yuv2rgb565_1);
                c->yuv2packed2 = RENAME(yuv2rgb565_2);
                break;
            case PIX_FMT_YUYV422:
                c->yuv2packed1 = RENAME(yuv2yuyv422_1);
                c->yuv2packed2 = RENAME(yuv2yuyv422_2);
                break;
            default:
                break;
            }
        }
    }
    if (c->srcBpc == 8 && c->dstBpc <= 10) {
        // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMX2
        if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed) {
            c->hyscale_fast = RENAME(hyscale_fast);
            c->hcscale_fast = RENAME(hcscale_fast);
        } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
            c->hyscale_fast = NULL;
            c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMX2
        }
#endif /* COMPILE_TEMPLATE_MMX2 */
    }
#if !COMPILE_TEMPLATE_MMX2
    switch (srcFormat) {
    case PIX_FMT_YUYV422: c->chrToYV12 = RENAME(yuy2ToUV); break;
    case PIX_FMT_UYVY422: c->chrToYV12 = RENAME(uyvyToUV); break;
    case PIX_FMT_NV12:    c->chrToYV12 = RENAME(nv12ToUV); break;
    case PIX_FMT_NV21:    c->chrToYV12 = RENAME(nv21ToUV); break;
    default: break;
    }
#endif /* !COMPILE_TEMPLATE_MMX2 */
    if (!c->chrSrcHSubSample) {
        switch (srcFormat) {
        case PIX_FMT_BGR24: c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_RGB24: c->chrToYV12 = RENAME(rgb24ToUV); break;
        default: break;
        }
    }
    switch (srcFormat) {
#if !COMPILE_TEMPLATE_MMX2
    case PIX_FMT_YUYV422:
    case PIX_FMT_Y400A:   c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422: c->lumToYV12 = RENAME(uyvyToY); break;
#endif /* !COMPILE_TEMPLATE_MMX2 */
    case PIX_FMT_BGR24:   c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_RGB24:   c->lumToYV12 = RENAME(rgb24ToY); break;
    default: break;
    }
#if !COMPILE_TEMPLATE_MMX2
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_Y400A: c->alpToYV12 = RENAME(yuy2ToY); break;
        default: break;
        }
    }
#endif /* !COMPILE_TEMPLATE_MMX2 */
}