/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
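/* This file is a template: it is #included once per CPU flavour (plain MMX
 * and MMX2), with RENAME() giving each function a flavour-specific name.
 * The macros below select the instruction variants for the flavour being
 * compiled. */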
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PREFETCH

#if COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
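/* MOVNTQ() is a non-temporal store on MMX2 and a plain movq otherwise. The
 * REAL_MOVNTQ() indirection makes the preprocessor expand the arguments
 * (e.g. register-name macros) before #a/#b stringify them. */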
#define YSCALEYUV2YV12X(offset, dest, end, pos) \
    __asm__ volatile(\
        "movq "DITHER16"+0(%0), %%mm3 \n\t"\
        "movq "DITHER16"+8(%0), %%mm4 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ".p2align 4 \n\t" /* FIXME Unroll? */\
        "1: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
        "movq 8(%%"REG_S", %3, 2), %%mm5 \n\t" /* srcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        " jnz 1b \n\t"\
        "psraw $3, %%mm3 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm3 \n\t"\
        MOVNTQ(%%mm3, (%1, %3))\
        "add $8, %3 \n\t"\
        "cmp %2, %3 \n\t"\
        "movq "DITHER16"+0(%0), %%mm3 \n\t"\
        "movq "DITHER16"+8(%0), %%mm4 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "r" (&c->redDither),\
           "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
        : "%"REG_d, "%"REG_S\
    );
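/* Vertical scaling: each loop iteration accumulates the filter taps on top
 * of a preloaded dither value, then shifts, clips and stores eight pixels.
 * A rough scalar equivalent for one output pixel (a sketch with simplified
 * names; dither16[] holds chrDither[]>>4 here):
 *
 *     int val = dither16[i & 7];
 *     for (j = 0; j < filterSize; j++)
 *         val += (src[j][i] * filter[j]) >> 16;   // pmulhw
 *     dest[i] = av_clip_uint8(val >> 3);          // psraw $3 + packuswb
 */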
static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
                             const int16_t **lumSrc, int lumFilterSize,
                             const int16_t *chrFilter, const int16_t **chrUSrc,
                             const int16_t **chrVSrc,
                             int chrFilterSize, const int16_t **alpSrc,
                             uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                             uint8_t *aDest, int dstW, int chrDstW,
                             const uint8_t *lumDither, const uint8_t *chrDither)
{
    int i;
    if (uDest) {
        x86_reg uv_off = c->uv_off;
        for (i=0; i<8; i++) c->dither16[i] = chrDither[i]>>4;
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
        for (i=0; i<8; i++) c->dither16[i] = chrDither[(i+3)&7]>>4;
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
    }
    for (i=0; i<8; i++) c->dither16[i] = lumDither[i]>>4;
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
    }
    YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
}
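/* "Accurate rounding" variant: pmaddwd keeps 32-bit intermediates, with the
 * dither value (chrDither[] << 12) preloaded as the accumulator and one
 * final >>19, avoiding the per-tap pmulhw truncation of the path above. */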
#define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
    __asm__ volatile(\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "movq "DITHER32"+0(%0), %%mm4 \n\t"\
        "movq "DITHER32"+8(%0), %%mm5 \n\t"\
        "movq "DITHER32"+16(%0), %%mm6 \n\t"\
        "movq "DITHER32"+24(%0), %%mm7 \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ".p2align 4 \n\t"\
        "1: \n\t"\
        "movq (%%"REG_S", %3, 2), %%mm0 \n\t" /* srcData */\
        "movq 8(%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %3, 2), %%mm1 \n\t" /* srcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "movq 8(%%"REG_S", %3, 2), %%mm3 \n\t" /* srcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 1b \n\t"\
        "psrad $19, %%mm4 \n\t"\
        "psrad $19, %%mm5 \n\t"\
        "psrad $19, %%mm6 \n\t"\
        "psrad $19, %%mm7 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "packuswb %%mm6, %%mm4 \n\t"\
        MOVNTQ(%%mm4, (%1, %3))\
        "add $8, %3 \n\t"\
        "cmp %2, %3 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "movq "DITHER32"+0(%0), %%mm4 \n\t"\
        "movq "DITHER32"+8(%0), %%mm5 \n\t"\
        "movq "DITHER32"+16(%0), %%mm6 \n\t"\
        "movq "DITHER32"+24(%0), %%mm7 \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "r" (&c->redDither),\
           "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                                uint8_t *aDest, int dstW, int chrDstW,
                                const uint8_t *lumDither, const uint8_t *chrDither)
{
    int i;
    if (uDest) {
        x86_reg uv_off = c->uv_off;
        for (i=0; i<8; i++) c->dither32[i] = chrDither[i]<<12;
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
        for (i=0; i<8; i++) c->dither32[i] = chrDither[(i+3)&7]<<12;
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
    }
    for (i=0; i<8; i++) c->dither32[i] = lumDither[i]<<12;
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
    }
    YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
}
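/* Unscaled path: one source line per plane, so each pixel is just shifted
 * from the 15-bit intermediate format down to 8 bits and clipped, roughly
 * dest[i] = av_clip_uint8(src[i] >> 7); the _ar variant below adds the
 * dither bytes before the shift. The index runs from -counter up to 0 so
 * one loop serves all four planes. */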
static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
                             const int16_t *chrUSrc, const int16_t *chrVSrc,
                             const int16_t *alpSrc,
                             uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                             uint8_t *aDest, int dstW, int chrDstW,
                             const uint8_t *lumDither, const uint8_t *chrDither)
{
    int p = 4;
    const int16_t *src[4] = { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
    uint8_t *dst[4] = { aDest, dest, uDest, vDest };
    x86_reg counter[4] = { dstW, dstW, chrDstW, chrDstW };

    while (p--) {
        if (dst[p]) {
            __asm__ volatile(
                "mov %2, %%"REG_a" \n\t"
                ".p2align 4 \n\t" /* FIXME Unroll? */
                "1: \n\t"
                "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
                "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
                "psraw $7, %%mm0 \n\t"
                "psraw $7, %%mm1 \n\t"
                "packuswb %%mm1, %%mm0 \n\t"
                MOVNTQ(%%mm0, (%1, %%REGa))
                "add $8, %%"REG_a" \n\t"
                "jnc 1b \n\t"
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }
}

static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
                                const int16_t *chrUSrc, const int16_t *chrVSrc,
                                const int16_t *alpSrc,
                                uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                                uint8_t *aDest, int dstW, int chrDstW,
                                const uint8_t *lumDither, const uint8_t *chrDither)
{
    int p = 4;
    const int16_t *src[4] = { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
    uint8_t *dst[4] = { aDest, dest, uDest, vDest };
    x86_reg counter[4] = { dstW, dstW, chrDstW, chrDstW };

    while (p--) {
        if (dst[p]) {
            int i;
            for (i=0; i<8; i++) c->dither16[i] = i<2 ? lumDither[i] : chrDither[i];
            __asm__ volatile(
                "mov %2, %%"REG_a" \n\t"
                "movq 0(%3), %%mm6 \n\t"
                "movq 8(%3), %%mm7 \n\t"
                ".p2align 4 \n\t" /* FIXME Unroll? */
                "1: \n\t"
                "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
                "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
                "paddsw %%mm6, %%mm0 \n\t"
                "paddsw %%mm7, %%mm1 \n\t"
                "psraw $7, %%mm0 \n\t"
                "psraw $7, %%mm1 \n\t"
                "packuswb %%mm1, %%mm0 \n\t"
                MOVNTQ(%%mm0, (%1, %%REGa))
                "add $8, %%"REG_a" \n\t"
                "jnc 1b \n\t"
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p]), "r"(c->dither16)
                : "%"REG_a
            );
        }
    }
}
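/* Packed output: YSCALEYUV2PACKEDX_UV accumulates the chroma filter taps
 * (U in %%mm3, V in %%mm4), and ..._YA does the same for two groups of four
 * luma pixels; the WRITE* macros further below interleave the results into
 * the destination pixel format. */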
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
        "lea "offset"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
        "movq "#dst1", "#dst2" \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
        "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw "#coeff", "#src1" \n\t"\
        "pmulhw "#coeff", "#src2" \n\t"\
        "paddw "#src1", "#dst1" \n\t"\
        "paddw "#src2", "#dst2" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
        :: "r" (&c->redDither), \
           "m" (dummy), "m" (dummy), "m" (dummy),\
           "r" (dest), "m" (dstW_reg), "m"(uv_off) \
        : "%"REG_a, "%"REG_d, "%"REG_S \
    );

#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "movq %%mm4, "U_TEMP"(%0) \n\t"\
        "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
        "lea "offset"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm1, %%mm1 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
        "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm4, %%mm0 \n\t"\
        "punpckhwd %%mm4, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
        "pmaddwd %%mm4, %%mm0 \n\t"\
        "pmaddwd %%mm4, %%mm3 \n\t"\
        "paddd %%mm0, %%mm1 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm4, %%mm2 \n\t"\
        "pmaddwd %%mm4, %%mm0 \n\t"\
        "paddd %%mm2, %%mm7 \n\t"\
        "paddd %%mm0, %%mm6 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm1 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm1 \n\t"\
        "packssdw %%mm6, %%mm7 \n\t"\
        "paddw %%mm0, %%mm1 \n\t"\
        "paddw %%mm0, %%mm7 \n\t"\
        "movq "U_TEMP"(%0), %%mm3 \n\t"\
        "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
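/* YSCALEYUV2RGBX applies the usual fixed-point YUV->RGB matrix, using the
 * coefficient and offset constants stored in the context (Y_COEFF,
 * Y_OFFSET, UB_COEFF, ...):
 *     R = cy*(Y-16) + crv*(V-128)
 *     G = cy*(Y-16) + cgu*(U-128) + cgv*(V-128)
 *     B = cy*(Y-16) + cbu*(U-128)
 * (cy, crv, ... are shorthand here) and leaves B/G/R for eight pixels
 * packed as unsigned bytes in mm2/mm4/mm5. */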
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)

static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "movq %%mm2, "U_TEMP"(%0) \n\t"
        "movq %%mm4, "V_TEMP"(%0) \n\t"
        "movq %%mm5, "Y_TEMP"(%0) \n\t"
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
        "movq "Y_TEMP"(%0), %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}
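/* RGB565 packing: B/G/R are masked down to their top 5/6/5 bits (the bF8
 * and bFC constants), shifted into place and ORed together; two MOVNTQs
 * store eight 16-bit pixels. */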
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)

static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
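/* RGB555 packing: the same scheme with all three components masked to
 * 5 bits; the extra shift on R leaves the top bit of each pixel zero. */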
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)

static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
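/* 24-bit output on plain MMX: build four 0RGB dwords, then shift and OR
 * the pieces so three quadwords hold eight tightly packed RGB24 pixels. */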
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif

static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}
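/* YUY2 output: pack Y, U and V down to bytes and interleave them into the
 * Y0 U0 Y1 V0 ... byte order of yuyv422. */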
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)

static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                     const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrUSrc,
                                     const int16_t **chrVSrc,
                                     int chrFilterSize, const int16_t **alpSrc,
                                     uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX_ACCURATE
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
                                  const int16_t **lumSrc, int lumFilterSize,
                                  const int16_t *chrFilter, const int16_t **chrUSrc,
                                  const int16_t **chrVSrc,
                                  int chrFilterSize, const int16_t **alpSrc,
                                  uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
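/* The *_2 functions blend two source lines per output line: the macros
 * below compute buf1 + (buf0 - buf1) * alpha in fixed point (pmulhw with
 * the per-line weight stored in the context), i.e. a vertical linear
 * interpolation, before running the same RGB matrix as above. */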
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf1[eax] + (uvbuf0[eax] - uvbuf1[eax])uvalpha1*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf1[eax+2048] + (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)

/**
 * vertical bilinear scale YV12 to RGB
 */
static void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0,
                                const uint16_t *buf1, const uint16_t *ubuf0,
                                const uint16_t *ubuf1, const uint16_t *vbuf0,
                                const uint16_t *vbuf1, const uint16_t *abuf0,
                                const uint16_t *abuf1, uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
        __asm__ volatile(
            YSCALEYUV2RGB(%%r8, %5)
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
               "a" (&c->redDither),
               "r" (abuf0), "r" (abuf1)
            : "%r8"
        );
#else
        c->u_temp = (intptr_t)abuf0;
        c->v_temp = (intptr_t)abuf1;
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "push %0 \n\t"
            "push %1 \n\t"
            "mov "U_TEMP"(%5), %0 \n\t"
            "mov "V_TEMP"(%5), %1 \n\t"
            YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            "pop %1 \n\t"
            "pop %0 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
#endif
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}

static void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0,
                                const uint16_t *buf1, const uint16_t *ubuf0,
                                const uint16_t *ubuf1, const uint16_t *vbuf0,
                                const uint16_t *vbuf1, const uint16_t *abuf0,
                                const uint16_t *abuf1, uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

static void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0,
                                 const uint16_t *buf1, const uint16_t *ubuf0,
                                 const uint16_t *ubuf1, const uint16_t *vbuf0,
                                 const uint16_t *vbuf1, const uint16_t *abuf0,
                                 const uint16_t *abuf1, uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB15(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

static void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0,
                                 const uint16_t *buf1, const uint16_t *ubuf0,
                                 const uint16_t *ubuf1, const uint16_t *vbuf0,
                                 const uint16_t *vbuf1, const uint16_t *abuf0,
                                 const uint16_t *abuf1, uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB16(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
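/* Packed-YUV bilinear: the blend weights are pre-shifted right by 3 and
 * the samples by 7 (instead of 4), presumably to keep the unmatrixed YUV
 * values within 16-bit range before WRITEYUY2 packs them. */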
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf1[eax] + (uvbuf0[eax] - uvbuf1[eax])uvalpha1*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf1[eax+2048] + (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

static void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0,
                                  const uint16_t *buf1, const uint16_t *ubuf0,
                                  const uint16_t *ubuf1, const uint16_t *vbuf0,
                                  const uint16_t *vbuf1, const uint16_t *abuf0,
                                  const uint16_t *abuf1, uint8_t *dest,
                                  int dstW, int yalpha, int uvalpha, int y)
{
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2PACKED(%%REGBP, %5)
        WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
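/* The *_1 functions handle the case without vertical scaling:
 * YSCALEYUV2RGB1 reads a single chroma line (taken when uvalpha < 2048,
 * which, as the callers note, shifts chrominance by half a pixel but is
 * faster), while the 1b variant averages the two chroma lines instead. */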
  1124. #define REAL_YSCALEYUV2RGB1(index, c) \
  1125. "xor "#index", "#index" \n\t"\
  1126. ".p2align 4 \n\t"\
  1127. "1: \n\t"\
  1128. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  1129. "add "UV_OFFx2"("#c"), "#index" \n\t" \
  1130. "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  1131. "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  1132. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  1133. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  1134. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  1135. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  1136. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  1137. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  1138. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  1139. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  1140. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  1141. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1142. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  1143. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1144. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1145. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  1146. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  1147. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  1148. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  1149. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  1150. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  1151. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  1152. "paddw %%mm3, %%mm4 \n\t"\
  1153. "movq %%mm2, %%mm0 \n\t"\
  1154. "movq %%mm5, %%mm6 \n\t"\
  1155. "movq %%mm4, %%mm3 \n\t"\
  1156. "punpcklwd %%mm2, %%mm2 \n\t"\
  1157. "punpcklwd %%mm5, %%mm5 \n\t"\
  1158. "punpcklwd %%mm4, %%mm4 \n\t"\
  1159. "paddw %%mm1, %%mm2 \n\t"\
  1160. "paddw %%mm1, %%mm5 \n\t"\
  1161. "paddw %%mm1, %%mm4 \n\t"\
  1162. "punpckhwd %%mm0, %%mm0 \n\t"\
  1163. "punpckhwd %%mm6, %%mm6 \n\t"\
  1164. "punpckhwd %%mm3, %%mm3 \n\t"\
  1165. "paddw %%mm7, %%mm0 \n\t"\
  1166. "paddw %%mm7, %%mm6 \n\t"\
  1167. "paddw %%mm7, %%mm3 \n\t"\
  1168. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  1169. "packuswb %%mm0, %%mm2 \n\t"\
  1170. "packuswb %%mm6, %%mm5 \n\t"\
  1171. "packuswb %%mm3, %%mm4 \n\t"\
  1172. #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
  1173. // do vertical chrominance interpolation
  1174. #define REAL_YSCALEYUV2RGB1b(index, c) \
  1175. "xor "#index", "#index" \n\t"\
  1176. ".p2align 4 \n\t"\
  1177. "1: \n\t"\
  1178. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  1179. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  1180. "add "UV_OFFx2"("#c"), "#index" \n\t" \
  1181. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  1182. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  1183. "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  1184. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  1185. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  1186. "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
  1187. "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
  1188. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  1189. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  1190. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  1191. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  1192. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  1193. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  1194. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  1195. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1196. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  1197. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1198. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1199. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  1200. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  1201. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  1202. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  1203. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  1204. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  1205. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  1206. "paddw %%mm3, %%mm4 \n\t"\
  1207. "movq %%mm2, %%mm0 \n\t"\
  1208. "movq %%mm5, %%mm6 \n\t"\
  1209. "movq %%mm4, %%mm3 \n\t"\
  1210. "punpcklwd %%mm2, %%mm2 \n\t"\
  1211. "punpcklwd %%mm5, %%mm5 \n\t"\
  1212. "punpcklwd %%mm4, %%mm4 \n\t"\
  1213. "paddw %%mm1, %%mm2 \n\t"\
  1214. "paddw %%mm1, %%mm5 \n\t"\
  1215. "paddw %%mm1, %%mm4 \n\t"\
  1216. "punpckhwd %%mm0, %%mm0 \n\t"\
  1217. "punpckhwd %%mm6, %%mm6 \n\t"\
  1218. "punpckhwd %%mm3, %%mm3 \n\t"\
  1219. "paddw %%mm7, %%mm0 \n\t"\
  1220. "paddw %%mm7, %%mm6 \n\t"\
  1221. "paddw %%mm7, %%mm3 \n\t"\
  1222. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  1223. "packuswb %%mm0, %%mm2 \n\t"\
  1224. "packuswb %%mm6, %%mm5 \n\t"\
  1225. "packuswb %%mm3, %%mm4 \n\t"\
  1226. #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
  1227. #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
  1228. "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
  1229. "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
  1230. "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
  1231. "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
  1232. "packuswb %%mm1, %%mm7 \n\t"
  1233. #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
  1234. /**
  1235. * YV12 to RGB without scaling or interpolating
  1236. */
  1237. static void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0,
  1238. const uint16_t *ubuf0, const uint16_t *ubuf1,
  1239. const uint16_t *vbuf0, const uint16_t *vbuf1,
  1240. const uint16_t *abuf0, uint8_t *dest,
  1241. int dstW, int uvalpha, enum PixelFormat dstFormat,
  1242. int flags, int y)
  1243. {
  1244. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1245. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1246. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1247. __asm__ volatile(
  1248. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1249. "mov %4, %%"REG_b" \n\t"
  1250. "push %%"REG_BP" \n\t"
  1251. YSCALEYUV2RGB1(%%REGBP, %5)
  1252. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1253. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1254. "pop %%"REG_BP" \n\t"
  1255. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1256. :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1257. "a" (&c->redDither)
  1258. );
  1259. } else {
  1260. __asm__ volatile(
  1261. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1262. "mov %4, %%"REG_b" \n\t"
  1263. "push %%"REG_BP" \n\t"
  1264. YSCALEYUV2RGB1(%%REGBP, %5)
  1265. "pcmpeqd %%mm7, %%mm7 \n\t"
  1266. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1267. "pop %%"REG_BP" \n\t"
  1268. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1269. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1270. "a" (&c->redDither)
  1271. );
  1272. }
  1273. } else {
  1274. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1275. __asm__ volatile(
  1276. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1277. "mov %4, %%"REG_b" \n\t"
  1278. "push %%"REG_BP" \n\t"
  1279. YSCALEYUV2RGB1b(%%REGBP, %5)
  1280. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1281. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1282. "pop %%"REG_BP" \n\t"
  1283. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1284. :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1285. "a" (&c->redDither)
  1286. );
  1287. } else {
  1288. __asm__ volatile(
  1289. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1290. "mov %4, %%"REG_b" \n\t"
  1291. "push %%"REG_BP" \n\t"
  1292. YSCALEYUV2RGB1b(%%REGBP, %5)
  1293. "pcmpeqd %%mm7, %%mm7 \n\t"
  1294. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1295. "pop %%"REG_BP" \n\t"
  1296. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1297. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1298. "a" (&c->redDither)
  1299. );
  1300. }
  1301. }
  1302. }
  1303. static void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0,
  1304. const uint16_t *ubuf0, const uint16_t *ubuf1,
  1305. const uint16_t *vbuf0, const uint16_t *vbuf1,
  1306. const uint16_t *abuf0, uint8_t *dest,
  1307. int dstW, int uvalpha, enum PixelFormat dstFormat,
  1308. int flags, int y)
  1309. {
  1310. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1311. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1312. __asm__ volatile(
  1313. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1314. "mov %4, %%"REG_b" \n\t"
  1315. "push %%"REG_BP" \n\t"
  1316. YSCALEYUV2RGB1(%%REGBP, %5)
  1317. "pxor %%mm7, %%mm7 \n\t"
  1318. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1319. "pop %%"REG_BP" \n\t"
  1320. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1321. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1322. "a" (&c->redDither)
  1323. );
  1324. } else {
  1325. __asm__ volatile(
  1326. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1327. "mov %4, %%"REG_b" \n\t"
  1328. "push %%"REG_BP" \n\t"
  1329. YSCALEYUV2RGB1b(%%REGBP, %5)
  1330. "pxor %%mm7, %%mm7 \n\t"
  1331. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1332. "pop %%"REG_BP" \n\t"
  1333. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1334. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1335. "a" (&c->redDither)
  1336. );
  1337. }
  1338. }

static void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0,
                                 const uint16_t *ubuf0, const uint16_t *ubuf1,
                                 const uint16_t *vbuf0, const uint16_t *vbuf1,
                                 const uint16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, enum PixelFormat dstFormat,
                                 int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}

static void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
                                 const uint16_t *ubuf0, const uint16_t *ubuf1,
                                 const uint16_t *vbuf0, const uint16_t *vbuf1,
                                 const uint16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, enum PixelFormat dstFormat,
                                 int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
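
/* Illustration only, not wired into any conversion path: a scalar sketch of
 * the DITHER1XBPP idea used above. An ordered-dither byte is added to each
 * 8-bit channel with unsigned saturation (paddusb) before the channel is
 * truncated to 5/6/5 bits. The exact bit packing lives in the WRITERGB15/
 * WRITERGB16 macros defined elsewhere in this file; the function name and
 * the assumed RGB565 layout below are ours. */
static inline uint16_t RENAME(rgb565_dither_sketch)(uint8_t r, uint8_t g, uint8_t b,
                                                    uint8_t dr, uint8_t dg, uint8_t db)
{
    unsigned R = FFMIN(r + dr, 255); // paddusb: saturating byte add
    unsigned G = FFMIN(g + dg, 255);
    unsigned B = FFMIN(b + db, 255);
    return (uint16_t)(((R >> 3) << 11) | ((G >> 2) << 5) | (B >> 3));
}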

#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t"\
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t"\
    "psraw $7, %%mm3 \n\t"\
    "psraw $7, %%mm4 \n\t"\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t"\
    "psraw $7, %%mm7 \n\t"

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t"\
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t"\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t"\
    "psrlw $8, %%mm4 \n\t"\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t"\
    "psraw $7, %%mm7 \n\t"

#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
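
/* Scalar sketch (names ours, unused by the code) of what the two macros
 * above compute per sample: the "1" variant reads one chroma line and
 * normalizes it with an arithmetic >>7, while the "1b" variant sums the two
 * chroma lines and shifts the wrapped 16-bit sum logically by 8, i.e. it
 * averages them; luma is >>7 in both cases. */
static inline void RENAME(packed1_sketch)(int16_t u0, int16_t u1, int16_t y,
                                          int bilinear, int *chroma, int *luma)
{
    if (bilinear)
        *chroma = (uint16_t)(u0 + u1) >> 8; // paddw + psrlw $8
    else
        *chroma = u0 >> 7;                  // psraw $7
    *luma = y >> 7;                         // psraw $7
}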

static void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0,
                                  const uint16_t *ubuf0, const uint16_t *ubuf1,
                                  const uint16_t *vbuf0, const uint16_t *vbuf1,
                                  const uint16_t *abuf0, uint8_t *dest,
                                  int dstW, int uvalpha, enum PixelFormat dstFormat,
                                  int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1b(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}

#if !COMPILE_TEMPLATE_MMX2
//FIXME yuy2* can read up to 7 samples too much

static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src,
                            int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}
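
/* What the loop above computes, as a plain-C sketch (the name is ours and
 * this is not part of the template's dispatch): YUYV stores luma in every
 * even byte, so extracting Y is a stride-2 copy; bm01010101 is the byte
 * mask that the pand/packuswb pair uses to keep those even bytes. */
static inline void RENAME(yuy2ToY_sketch)(uint8_t *dst, const uint8_t *src, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = src[2 * i]; // keep the even bytes (Y), drop the odd bytes (U/V)
}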

static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}
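
/* Companion sketch for the chroma path (illustrative only): in YUYV the
 * U sample sits at byte 1 and the V sample at byte 3 of each 4-byte group,
 * which is what the psrlw/pand/packuswb sequence above extracts. */
static inline void RENAME(yuy2ToUV_sketch)(uint8_t *dstU, uint8_t *dstV,
                                           const uint8_t *src, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dstU[i] = src[4 * i + 1]; // Y U Y V: U at byte offset 1
        dstV[i] = src[4 * i + 3]; // Y U Y V: V at byte offset 3
    }
}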

static void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV,
                           const uint8_t *src1, const uint8_t *src2,
                           int width, uint32_t *unused)
{
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
}

/* This is almost identical to the previous function and exists only because
 * yuy2ToY/yuy2ToUV(dst, src+1, ...) would have 100% unaligned accesses. */
static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src,
                            int width, uint32_t *unused)
{
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}
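
/* UYVY counterpart of the earlier sketch (again illustrative, name ours):
 * luma sits in the odd bytes, so the loop above shifts each 16-bit pair
 * right by 8 before packing. */
static inline void RENAME(uyvyToY_sketch)(uint8_t *dst, const uint8_t *src, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = src[2 * i + 1]; // U Y V Y: Y at the odd byte offsets
}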

static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}

static void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV,
                           const uint8_t *src1, const uint8_t *src2,
                           int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "pand %%mm4, %%mm2 \n\t"
        "pand %%mm4, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
}

static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                              const uint8_t *src, int width)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "movq %%mm2, (%3, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
}

static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}

static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
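
/* nvXXtoUV above deinterleaves a packed chroma plane. Scalar sketch (name
 * ours): even bytes go to dst1, odd bytes to dst2; nv12ToUV and nv21ToUV
 * merely swap which destination receives U and which receives V. */
static inline void RENAME(nvXXtoUV_sketch)(uint8_t *dst1, uint8_t *dst2,
                                           const uint8_t *src, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dst1[i] = src[2 * i];     // pand with the 0x00FF byte mask
        dst2[i] = src[2 * i + 1]; // psrlw $8
    }
}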

#endif /* !COMPILE_TEMPLATE_MMX2 */

static av_always_inline void RENAME(bgr24ToY_mmx)(int16_t *dst, const uint8_t *src,
                                                  int width, enum PixelFormat srcFormat)
{
    if (srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
        "mov %2, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "movd 6(%0), %%mm2 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm4, %%mm0 \n\t"
        "paddd %%mm4, %%mm2 \n\t"
        "psrad $9, %%mm0 \n\t"
        "psrad $9, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dst+width), "g" ((x86_reg)-2*width)
        : "%"REG_a
    );
}
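
/* The kernel above is a per-pixel fixed-point dot product; the actual
 * coefficients live in ff_bgr24toY1Coeff/ff_bgr24toY2Coeff (or the rgb24
 * variants) and are not spelled out in this file. A scalar sketch with
 * symbolic coefficients (all names ours): */
static inline int16_t RENAME(rgb2y_sketch)(int r, int g, int b,
                                           int ry, int gy, int by, int rnd)
{
    // pmaddwd/paddd accumulate in 32 bits; psrad $9 rescales, packssdw narrows
    return (int16_t)((ry * r + gy * g + by * b + rnd) >> 9);
}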

static void RENAME(bgr24ToY)(int16_t *dst, const uint8_t *src,
                             int width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
}

static void RENAME(rgb24ToY)(int16_t *dst, const uint8_t *src,
                             int width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
}

static av_always_inline void RENAME(bgr24ToUV_mmx)(int16_t *dstU, int16_t *dstV,
                                                   const uint8_t *src, int width,
                                                   enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq 24(%4), %%mm6 \n\t"
        "mov %3, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pmaddwd (%4), %%mm0 \n\t"
        "pmaddwd 8(%4), %%mm1 \n\t"
        "pmaddwd 16(%4), %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "movd 6(%0), %%mm1 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "pmaddwd (%4), %%mm1 \n\t"
        "pmaddwd 8(%4), %%mm3 \n\t"
        "pmaddwd 16(%4), %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm5 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm5, %%mm4 \n\t"
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
        "paddd %%mm3, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm3, %%mm4 \n\t"
        "psrad $9, %%mm0 \n\t"
        "psrad $9, %%mm2 \n\t"
        "psrad $9, %%mm1 \n\t"
        "psrad $9, %%mm4 \n\t"
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm4, %%mm2 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "movq %%mm2, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-2*width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
        : "%"REG_a
    );
}
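
/* Chroma works the same way: two dot products per pixel against the
 * ff_bgr24toUV[] coefficient block (whose exact layout is defined elsewhere),
 * biased by ff_bgr24toUVOffset and rescaled with psrad $9. Sketch with
 * symbolic coefficients (all names ours): */
static inline void RENAME(rgb2uv_sketch)(int r, int g, int b,
                                         const int *cu, const int *cv,
                                         int rnd, int16_t *u, int16_t *v)
{
    *u = (int16_t)((cu[0] * r + cu[1] * g + cu[2] * b + rnd) >> 9);
    *v = (int16_t)((cv[0] * r + cv[1] * g + cv[2] * b + rnd) >> 9);
}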

static void RENAME(bgr24ToUV)(int16_t *dstU, int16_t *dstV,
                              const uint8_t *src1, const uint8_t *src2,
                              int width, uint32_t *unused)
{
    assert(src1 == src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
}

static void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV,
                              const uint8_t *src1, const uint8_t *src2,
                              int width, uint32_t *unused)
{
    assert(src1 == src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
}

#if !COMPILE_TEMPLATE_MMX2
// bilinear / bicubic scaling
static void RENAME(hScale)(int16_t *dst, int dstW,
                           const uint8_t *src, const int16_t *filter,
                           const int16_t *filterPos, int filterSize)
{
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
            "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm4 \n\t"
            "pmaddwd %%mm2, %%mm5 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "paddd %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor %%mm7, %%mm7 \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "mov %2, %%"REG_c" \n\t"
            "movzwl (%%"REG_c", %0), %%eax \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c" \n\t"
            "pxor %%mm4, %%mm4 \n\t"
            "pxor %%mm5, %%mm5 \n\t"
            "2: \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1, %6), %%mm3 \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "paddd %%mm3, %%mm5 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "add $8, %1 \n\t"
            "add $4, %%"REG_c" \n\t"
            "cmp %4, %%"REG_c" \n\t"
            " jb 2b \n\t"
            "add %6, %1 \n\t"
            "movq %%mm4, %%mm0 \n\t"
            "punpckldq %%mm5, %%mm4 \n\t"
            "punpckhdq %%mm5, %%mm0 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "psrad $7, %%mm4 \n\t"
            "packssdw %%mm4, %%mm4 \n\t"
            "mov %3, %%"REG_a" \n\t"
            "movd %%mm4, (%%"REG_a", %0) \n\t"
            "add $4, %0 \n\t"
            " jnc 1b \n\t"
            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
}
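
/* All three branches above evaluate the same FIR filter; a plain-C sketch
 * (name ours, the real dispatch uses the MMX paths) that mirrors the integer
 * fallback in hScale16() further down: */
static inline void RENAME(hScale_sketch)(int16_t *dst, int dstW, const uint8_t *src,
                                         const int16_t *filter,
                                         const int16_t *filterPos, int filterSize)
{
    int i, j;
    for (i = 0; i < dstW; i++) {
        int val = 0;
        for (j = 0; j < filterSize; j++)
            val += ((int)src[filterPos[i] + j]) * filter[filterSize * i + j];
        dst[i] = FFMIN(val >> 7, (1 << 15) - 1); // >>7 + saturate, like psrad/packssdw
    }
}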

#endif /* !COMPILE_TEMPLATE_MMX2 */

static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
                                    const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
{
    int i, j;

    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4 && shift<15) { // Always true for upscaling, sometimes for down, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd %5, %%mm7 \n\t"
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
            "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad %%mm7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8 && shift<15) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd %5, %%mm7 \n\t"
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
            "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movq 8(%3, %%"REG_a", 2), %%mm4 \n\t"
            "movq 8(%3, %%"REG_b", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm4 \n\t"
            "pmaddwd %%mm2, %%mm5 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "paddd %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad %%mm7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (shift<15) {
        const uint16_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd %7, %%mm7 \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "mov %2, %%"REG_c" \n\t"
            "movzwl (%%"REG_c", %0), %%eax \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c" \n\t"
            "pxor %%mm4, %%mm4 \n\t"
            "pxor %%mm5, %%mm5 \n\t"
            "2: \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1, %6), %%mm3 \n\t"
            "movq (%%"REG_c", %%"REG_a", 2), %%mm0 \n\t"
            "movq (%%"REG_c", %%"REG_d", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "paddd %%mm3, %%mm5 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "add $8, %1 \n\t"
            "add $8, %%"REG_c" \n\t"
            "cmp %4, %%"REG_c" \n\t"
            " jb 2b \n\t"
            "add %6, %1 \n\t"
            "movq %%mm4, %%mm0 \n\t"
            "punpckldq %%mm5, %%mm4 \n\t"
            "punpckhdq %%mm5, %%mm0 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "psrad %%mm7, %%mm4 \n\t"
            "packssdw %%mm4, %%mm4 \n\t"
            "mov %3, %%"REG_a" \n\t"
            "movd %%mm4, (%%"REG_a", %0) \n\t"
            "add $4, %0 \n\t"
            " jnc 1b \n\t"
            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((x86_reg)filterSize*2), "m"(shift)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    } else
        for (i=0; i<dstW; i++) {
            int srcPos= filterPos[i];
            int val=0;
            for (j=0; j<filterSize; j++) {
                val += ((int)src[srcPos + j])*filter[filterSize*i + j];
            }
            dst[i] = FFMIN(val>>shift, (1<<15)-1); // the cubic equation does overflow ...
        }
}

#if COMPILE_TEMPLATE_MMX2
static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                 int dstWidth, const uint8_t *src,
                                 int srcW, int xInc)
{
    int16_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    void *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %5 \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
#if ARCH_X86_64
#define CALL_MMX2_FILTER_CODE \
        "movl (%%"REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
        "add %%"REG_S", %%"REG_c" \n\t"\
        "add %%"REG_a", %%"REG_D" \n\t"\
        "xor %%"REG_a", %%"REG_a" \n\t"

#else
#define CALL_MMX2_FILTER_CODE \
        "movl (%%"REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
        "add %%"REG_a", %%"REG_D" \n\t"\
        "xor %%"REG_a", %%"REG_a" \n\t"

#endif /* ARCH_X86_64 */
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
#if defined(PIC)
        "mov %5, %%"REG_b" \n\t"
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
        dst[i] = src[srcW-1]*128;
}

static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
                                 int dstWidth, const uint8_t *src1,
                                 const uint8_t *src2, int srcW, int xInc)
{
    int16_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    void *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %7 \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "mov %5, %%"REG_c" \n\t" // src
        "mov %6, %%"REG_D" \n\t" // buf2
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
#if defined(PIC)
        "mov %7, %%"REG_b" \n\t"
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
        dst1[i] = src1[srcW-1]*128;
        dst2[i] = src2[srcW-1]*128;
    }
}
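
/* The generated mmx2FilterCode blocks called above implement a fast
 * horizontal resample in 16.16 fixed point. Conceptually (a sketch under
 * that assumption; the name and the 2-tap reduction are ours) each output
 * interpolates between neighbouring source pixels and is scaled to 15-bit
 * range, which is why the edge fill-in loops above multiply the last source
 * sample by 128: */
static inline void RENAME(hyscale_fast_sketch)(int16_t *dst, int dstWidth,
                                               const uint8_t *src, int xInc)
{
    int i, xpos = 0;
    for (i = 0; i < dstWidth; i++) {
        int xx     = xpos >> 16;           // integer source position
        int xalpha = (xpos & 0xFFFF) >> 9; // 7-bit interpolation weight
        dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
        xpos  += xInc;
    }
}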

#endif /* COMPILE_TEMPLATE_MMX2 */

static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat,
                     dstFormat = c->dstFormat;

    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != PIX_FMT_NV12
        && dstFormat != PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
            c->yuv2yuvX = RENAME(yuv2yuvX_ar );
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
                case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
                case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
                case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
                case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
                default: break;
                }
            }
        } else {
            int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
            c->yuv2yuv1 = should_dither ? RENAME(yuv2yuv1_ar ) : RENAME(yuv2yuv1 );
            c->yuv2yuvX = RENAME(yuv2yuvX );
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
                case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
                case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
                case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
                case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
                default: break;
                }
            }
        }
        if (!(c->flags & SWS_FULL_CHR_H_INT)) {
            switch (c->dstFormat) {
            case PIX_FMT_RGB32:
                c->yuv2packed1 = RENAME(yuv2rgb32_1);
                c->yuv2packed2 = RENAME(yuv2rgb32_2);
                break;
            case PIX_FMT_BGR24:
                c->yuv2packed1 = RENAME(yuv2bgr24_1);
                c->yuv2packed2 = RENAME(yuv2bgr24_2);
                break;
            case PIX_FMT_RGB555:
                c->yuv2packed1 = RENAME(yuv2rgb555_1);
                c->yuv2packed2 = RENAME(yuv2rgb555_2);
                break;
            case PIX_FMT_RGB565:
                c->yuv2packed1 = RENAME(yuv2rgb565_1);
                c->yuv2packed2 = RENAME(yuv2rgb565_2);
                break;
            case PIX_FMT_YUYV422:
                c->yuv2packed1 = RENAME(yuv2yuyv422_1);
                c->yuv2packed2 = RENAME(yuv2yuyv422_2);
                break;
            default:
                break;
            }
        }
    }

#if !COMPILE_TEMPLATE_MMX2
    c->hScale = RENAME(hScale );
#endif /* !COMPILE_TEMPLATE_MMX2 */

    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMX2
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed) {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
        c->hyscale_fast = NULL;
        c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMX2
    }
#endif /* COMPILE_TEMPLATE_MMX2 */

#if !COMPILE_TEMPLATE_MMX2
    switch(srcFormat) {
    case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
    case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
    case PIX_FMT_NV12    : c->chrToYV12 = RENAME(nv12ToUV); break;
    case PIX_FMT_NV21    : c->chrToYV12 = RENAME(nv21ToUV); break;
    case PIX_FMT_GRAY16LE   :
    case PIX_FMT_YUV420P9LE :
    case PIX_FMT_YUV422P10LE:
    case PIX_FMT_YUV420P10LE:
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE: c->hScale16= RENAME(hScale16); break;
    }
#endif /* !COMPILE_TEMPLATE_MMX2 */
    if (!c->chrSrcHSubSample) {
        switch(srcFormat) {
        case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
        default: break;
        }
    }
    switch (srcFormat) {
#if !COMPILE_TEMPLATE_MMX2
    case PIX_FMT_YUYV422 :
    case PIX_FMT_Y400A   :
        c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422 :
        c->lumToYV12 = RENAME(uyvyToY); break;
#endif /* !COMPILE_TEMPLATE_MMX2 */
    case PIX_FMT_BGR24   : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_RGB24   : c->lumToYV12 = RENAME(rgb24ToY); break;
    default: break;
    }
#if !COMPILE_TEMPLATE_MMX2
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
        default: break;
        }
    }
#endif /* !COMPILE_TEMPLATE_MMX2 */
    if (isAnyRGB(c->srcFormat))
        c->hScale16= RENAME(hScale16);
}