/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PREFETCH

#if COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
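
/* MOVNTQ goes through REAL_MOVNTQ so that its arguments are macro-expanded
 * before being stringified into the asm text (the usual two-level cpp
 * expansion trick). On MMX2 the store is the non-temporal movntq, which
 * bypasses the cache for write-once pixel output; plain MMX falls back to
 * an ordinary movq.
 *
 * The YSCALEYUV2YV12X macro below walks a filter list of 16-byte entries
 * starting at 'offset' inside the context: each entry holds a source
 * pointer at +0 and a coefficient at +8, and a NULL source pointer
 * terminates the inner loop. */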
#define YSCALEYUV2YV12X(offset, dest, end, pos) \
    __asm__ volatile(\
    "movq "DITHER16"+0(%0), %%mm3 \n\t"\
    "movq "DITHER16"+8(%0), %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ".p2align 4 \n\t" /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
    "movq 8(%%"REG_S", %3, 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %3))\
    "add $8, %3 \n\t"\
    "cmp %2, %3 \n\t"\
    "movq "DITHER16"+0(%0), %%mm3 \n\t"\
    "movq "DITHER16"+8(%0), %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
    : "%"REG_d, "%"REG_S\
    );
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                                    uint8_t *aDest, int dstW, int chrDstW,
                                    const uint8_t *lumDither, const uint8_t *chrDither)
{
    int i;
    if (uDest) {
        x86_reg uv_off = c->uv_off;
        for(i=0; i<8; i++) c->dither16[i] = chrDither[i]>>4;
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
        for(i=0; i<8; i++) c->dither16[i] = chrDither[(i+3)&7]>>4;
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
    }
    for(i=0; i<8; i++) c->dither16[i] = lumDither[i]>>4;
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
    }
    YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
}
#define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
    __asm__ volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "movq "DITHER32"+0(%0), %%mm4 \n\t"\
    "movq "DITHER32"+8(%0), %%mm5 \n\t"\
    "movq "DITHER32"+16(%0), %%mm6 \n\t"\
    "movq "DITHER32"+24(%0), %%mm7 \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%%"REG_S", %3, 2), %%mm0 \n\t" /* srcData */\
    "movq 8(%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %3, 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %3, 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $19, %%mm4 \n\t"\
    "psrad $19, %%mm5 \n\t"\
    "psrad $19, %%mm6 \n\t"\
    "psrad $19, %%mm7 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %3))\
    "add $8, %3 \n\t"\
    "cmp %2, %3 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "movq "DITHER32"+0(%0), %%mm4 \n\t"\
    "movq "DITHER32"+8(%0), %%mm5 \n\t"\
    "movq "DITHER32"+16(%0), %%mm6 \n\t"\
    "movq "DITHER32"+24(%0), %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
static inline void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
                                       const int16_t **lumSrc, int lumFilterSize,
                                       const int16_t *chrFilter, const int16_t **chrUSrc,
                                       const int16_t **chrVSrc,
                                       int chrFilterSize, const int16_t **alpSrc,
                                       uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                                       uint8_t *aDest, int dstW, int chrDstW,
                                       const uint8_t *lumDither, const uint8_t *chrDither)
{
    int i;
    if (uDest) {
        x86_reg uv_off = c->uv_off;
        for(i=0; i<8; i++) c->dither32[i] = chrDither[i]<<12;
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
        for(i=0; i<8; i++) c->dither32[i] = chrDither[(i+3)&7]<<12;
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
    }
    for(i=0; i<8; i++) c->dither32[i] = lumDither[i]<<12;
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
    }
    YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
}
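
/* The _ar ("accurate rounding") variants differ from the plain ones in that
 * they accumulate pmaddwd products in four 32-bit accumulators and shift
 * down by 19 at the end, instead of summing 16-bit pmulhw results and
 * shifting by 3; accordingly the dither values are stored <<12 in dither32
 * rather than >>4 in dither16. */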
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ".p2align 4 \n\t" /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
                                    const int16_t *chrUSrc, const int16_t *chrVSrc,
                                    const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                                    uint8_t *aDest, int dstW, int chrDstW,
                                    const uint8_t *lumDither, const uint8_t *chrDither)
{
    int p= 4;
    const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
    uint8_t *dst[4]= { aDest, dest, uDest, vDest };
    x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
    while (p--) {
        if (dst[p]) {
            __asm__ volatile(
                YSCALEYUV2YV121
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }
}
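
/* yuv2yuv1 is the unscaled 1:1 path: each plane is simply shifted down from
 * the 16-bit intermediate format (psraw $7) and packed to bytes. Passing
 * "g" (-counter[p]) together with dst[p] + counter[p] makes the index run
 * from -N up to 0, so the loop can exit on the carry flag (jnc) without a
 * separate compare. */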
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "movq 0(%3), %%mm6 \n\t"\
    "movq 8(%3), %%mm7 \n\t"\
    ".p2align 4 \n\t" /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm6, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"
static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
                                       const int16_t *chrUSrc, const int16_t *chrVSrc,
                                       const int16_t *alpSrc,
                                       uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                                       uint8_t *aDest, int dstW, int chrDstW,
                                       const uint8_t *lumDither, const uint8_t *chrDither)
{
    int p= 4;
    const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
    uint8_t *dst[4]= { aDest, dest, uDest, vDest };
    x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
    while (p--) {
        if (dst[p]) {
            int i;
            for(i=0; i<8; i++) c->dither16[i] = i<2 ? lumDither[i] : chrDither[i];
            __asm__ volatile(
                YSCALEYUV2YV121_ACCURATE
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p]), "r"(c->dither16)
                : "%"REG_a
            );
        }
    }
}
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ".p2align 4 \n\t"\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "add %6, %%"REG_S" \n\t" \
    "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW_reg), "m"(uv_off) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );
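
/* YSCALEYUV2PACKEDX (and its _ACCURATE sibling below) opens an
 * __asm__ volatile(...) statement without closing it; a WRITE* macro then
 * supplies the store loop and YSCALEYUV2PACKEDX_END supplies the operand
 * lists and the closing parenthesis, so the three pieces must always be
 * used together. */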
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ".p2align 4 \n\t"\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "add %6, %%"REG_S" \n\t" \
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "add %6, %%"REG_S" \n\t" \
    "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
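
/* WRITEBGR32 interleaves the packed B, G, R and A byte vectors into four
 * ARGB quadwords with two rounds of punpckl/punpckhbw and
 * punpckl/punpckhwd, then streams them out with MOVNTQ: 32 output bytes
 * per 8-pixel iteration. */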
static inline void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                          const int16_t **lumSrc, int lumFilterSize,
                                          const int16_t *chrFilter, const int16_t **chrUSrc,
                                          const int16_t **chrVSrc,
                                          int chrFilterSize, const int16_t **alpSrc,
                                          uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "movq %%mm2, "U_TEMP"(%0) \n\t"
        "movq %%mm4, "V_TEMP"(%0) \n\t"
        "movq %%mm5, "Y_TEMP"(%0) \n\t"
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
        "movq "Y_TEMP"(%0), %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

static inline void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
                                       const int16_t **lumSrc, int lumFilterSize,
                                       const int16_t *chrFilter, const int16_t **chrUSrc,
                                       const int16_t **chrVSrc,
                                       int chrFilterSize, const int16_t **alpSrc,
                                       uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
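
/* WRITERGB16 packs to RGB565: the channels are masked to their top five
 * (blue, red) or six (green) bits with bF8/bFC, shifted into place and
 * OR-merged, giving 16 output bytes per 8-pixel iteration. */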
static inline void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                           const int16_t **lumSrc, int lumFilterSize,
                                           const int16_t *chrFilter, const int16_t **chrUSrc,
                                           const int16_t **chrVSrc,
                                           int chrFilterSize, const int16_t **alpSrc,
                                           uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static inline void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
                                        const int16_t **lumSrc, int lumFilterSize,
                                        const int16_t *chrFilter, const int16_t **chrUSrc,
                                        const int16_t **chrVSrc,
                                        int chrFilterSize, const int16_t **alpSrc,
                                        uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
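
/* WRITERGB15 is the RGB555 variant: all three channels keep five bits
 * (bF8 everywhere), red is shifted right one extra bit and green left by 2
 * instead of 3, leaving the top bit of each output word clear. */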
static inline void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                           const int16_t **lumSrc, int lumFilterSize,
                                           const int16_t *chrFilter, const int16_t **chrUSrc,
                                           const int16_t **chrVSrc,
                                           int chrFilterSize, const int16_t **alpSrc,
                                           uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static inline void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
                                        const int16_t **lumSrc, int lumFilterSize,
                                        const int16_t *chrFilter, const int16_t **chrUSrc,
                                        const int16_t **chrVSrc,
                                        int chrFilterSize, const int16_t **alpSrc,
                                        uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
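
/* The MMX2 24-bit writer uses pshufw plus the ff_M24A/B/C masks to merge
 * B, G and R directly into three 8-byte groups, whereas the plain MMX
 * version first builds 0RGB dwords and shifts them together; both store
 * 24 bytes (8 pixels) per iteration. */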
static inline void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                          const int16_t **lumSrc, int lumFilterSize,
                                          const int16_t *chrFilter, const int16_t **chrUSrc,
                                          const int16_t **chrVSrc,
                                          int chrFilterSize, const int16_t **alpSrc,
                                          uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

static inline void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
                                       const int16_t **lumSrc, int lumFilterSize,
                                       const int16_t *chrFilter, const int16_t **chrUSrc,
                                       const int16_t **chrVSrc,
                                       int chrFilterSize, const int16_t **alpSrc,
                                       uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
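
/* WRITEYUY2 packs luma and chroma back into interleaved YUYV: U and V are
 * narrowed and interleaved into a single UVUV... register, then
 * punpcklbw/punpckhbw against the packed Y vector yield the YUYV byte
 * order for two MOVNTQ stores per iteration. */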
static inline void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                            const int16_t **lumSrc, int lumFilterSize,
                                            const int16_t *chrFilter, const int16_t **chrUSrc,
                                            const int16_t **chrVSrc,
                                            int chrFilterSize, const int16_t **alpSrc,
                                            uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX_ACCURATE
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
                                         const int16_t **lumSrc, int lumFilterSize,
                                         const int16_t *chrFilter, const int16_t **chrUSrc,
                                         const int16_t **chrVSrc,
                                         int chrFilterSize, const int16_t **alpSrc,
                                         uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
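
/* YSCALEYUV2RGB implements the two-row vertical bilinear step: for both
 * chroma and luma it computes buf1 + ((buf0 - buf1) * alpha >> 16), i.e.
 * buf0*alpha + buf1*(1-alpha), using psubw/pmulhw/paddw with the per-frame
 * alpha kept in the context, and then runs the result through the same
 * offset/coefficient stage as the multi-tap path. */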
/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0,
                                       const uint16_t *buf1, const uint16_t *ubuf0,
                                       const uint16_t *ubuf1, const uint16_t *vbuf0,
                                       const uint16_t *vbuf1, const uint16_t *abuf0,
                                       const uint16_t *abuf1, uint8_t *dest,
                                       int dstW, int yalpha, int uvalpha, int y)
{
    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
        __asm__ volatile(
            YSCALEYUV2RGB(%%r8, %5)
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
               "a" (&c->redDither),
               "r" (abuf0), "r" (abuf1)
            : "%r8"
        );
#else
        c->u_temp=(intptr_t)abuf0;
        c->v_temp=(intptr_t)abuf1;
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "push %0 \n\t"
            "push %1 \n\t"
            "mov "U_TEMP"(%5), %0 \n\t"
            "mov "V_TEMP"(%5), %1 \n\t"
            YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            "pop %1 \n\t"
            "pop %0 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
#endif
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
static inline void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0,
                                       const uint16_t *buf1, const uint16_t *ubuf0,
                                       const uint16_t *ubuf1, const uint16_t *vbuf0,
                                       const uint16_t *vbuf1, const uint16_t *abuf0,
                                       const uint16_t *abuf1, uint8_t *dest,
                                       int dstW, int yalpha, int uvalpha, int y)
{
    // Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

static inline void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0,
                                        const uint16_t *buf1, const uint16_t *ubuf0,
                                        const uint16_t *ubuf1, const uint16_t *vbuf0,
                                        const uint16_t *vbuf1, const uint16_t *abuf0,
                                        const uint16_t *abuf1, uint8_t *dest,
                                        int dstW, int yalpha, int uvalpha, int y)
{
    // Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB15(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0,
                                        const uint16_t *buf1, const uint16_t *ubuf0,
                                        const uint16_t *ubuf1, const uint16_t *vbuf0,
                                        const uint16_t *vbuf1, const uint16_t *abuf0,
                                        const uint16_t *abuf1, uint8_t *dest,
                                        int dstW, int yalpha, int uvalpha, int y)
{
    // Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB16(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0,
                                         const uint16_t *buf1, const uint16_t *ubuf0,
                                         const uint16_t *ubuf1, const uint16_t *vbuf0,
                                         const uint16_t *vbuf1, const uint16_t *abuf0,
                                         const uint16_t *abuf1, uint8_t *dest,
                                         int dstW, int yalpha, int uvalpha, int y)
{
    // Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2PACKED(%%REGBP, %5)
        WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
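
/* The "1" variants read a single input row with no vertical interpolation:
 * YSCALEYUV2RGB1 (used when uvalpha < 2048) just shifts the chroma of one
 * row down, while YSCALEYUV2RGB1b averages the two chroma rows by summing
 * them and shifting right by 5 instead of 4. YSCALEYUV2RGB1_ALPHA loads
 * and packs the alpha channel from abuf0. */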
/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0,
                                       const uint16_t *ubuf0, const uint16_t *ubuf1,
                                       const uint16_t *vbuf0, const uint16_t *vbuf1,
                                       const uint16_t *abuf0, uint8_t *dest,
                                       int dstW, int uvalpha, enum PixelFormat dstFormat,
                                       int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    } else {
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    }
}
static inline void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0,
                                       const uint16_t *ubuf0, const uint16_t *ubuf1,
                                       const uint16_t *vbuf0, const uint16_t *vbuf1,
                                       const uint16_t *abuf0, uint8_t *dest,
                                       int dstW, int uvalpha, enum PixelFormat dstFormat,
                                       int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0,
                                        const uint16_t *ubuf0, const uint16_t *ubuf1,
                                        const uint16_t *vbuf0, const uint16_t *vbuf1,
                                        const uint16_t *abuf0, uint8_t *dest,
                                        int dstW, int uvalpha, enum PixelFormat dstFormat,
                                        int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
  1391. static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
  1392. const uint16_t *ubuf0, const uint16_t *ubuf1,
  1393. const uint16_t *vbuf0, const uint16_t *vbuf1,
  1394. const uint16_t *abuf0, uint8_t *dest,
  1395. int dstW, int uvalpha, enum PixelFormat dstFormat,
  1396. int flags, int y)
  1397. {
  1398. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1399. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1400. __asm__ volatile(
  1401. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1402. "mov %4, %%"REG_b" \n\t"
  1403. "push %%"REG_BP" \n\t"
  1404. YSCALEYUV2RGB1(%%REGBP, %5)
  1405. "pxor %%mm7, %%mm7 \n\t"
  1406. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1407. #ifdef DITHER1XBPP
  1408. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1409. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1410. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1411. #endif
  1412. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1413. "pop %%"REG_BP" \n\t"
  1414. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1415. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1416. "a" (&c->redDither)
  1417. );
  1418. } else {
  1419. __asm__ volatile(
  1420. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1421. "mov %4, %%"REG_b" \n\t"
  1422. "push %%"REG_BP" \n\t"
  1423. YSCALEYUV2RGB1b(%%REGBP, %5)
  1424. "pxor %%mm7, %%mm7 \n\t"
  1425. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1426. #ifdef DITHER1XBPP
  1427. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1428. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1429. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1430. #endif
  1431. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1432. "pop %%"REG_BP" \n\t"
  1433. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1434. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1435. "a" (&c->redDither)
  1436. );
  1437. }
  1438. }
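
/* Illustrative sketch (hypothetical *_ref helpers): after the YUV->RGB math,
 * WRITERGB15/WRITERGB16 truncate the 8-bit components to 5/5/5 or 5/6/5 bits.
 * The DITHER1XBPP blocks above first add a per-pixel bias from the *_DITHER
 * tables (paddusb, a saturating add), which turns the truncation into ordered
 * dithering. The packing itself is roughly: */
static inline uint16_t RENAME(pack555_ref)(uint8_t r, uint8_t g, uint8_t b)
{
    return ((r & 0xF8) << 7) | ((g & 0xF8) << 2) | (b >> 3);
}

static inline uint16_t RENAME(pack565_ref)(uint8_t r, uint8_t g, uint8_t b)
{
    return ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3);
}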
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
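
/* Illustrative sketch (hypothetical *_ref helper): the buffers hold samples
 * in 7-bit fixed point. The PACKED1 variant simply undoes the shift on the
 * single chroma line (the psraw $7 above), while PACKED1b averages the two
 * chroma lines: the sum of two <<7 values shifted right by 8 is their mean. */
static inline void RENAME(packed1_chroma_ref)(const uint16_t *ubuf0,
                                              const uint16_t *ubuf1,
                                              uint8_t *dst, int n, int average)
{
    int i;
    for (i = 0; i < n; i++)
        dst[i] = average ? (ubuf0[i] + ubuf1[i]) >> 8  // PACKED1b
                         :  ubuf0[i] >> 7;             // PACKED1
}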
static inline void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0,
                                         const uint16_t *ubuf0, const uint16_t *ubuf1,
                                         const uint16_t *vbuf0, const uint16_t *vbuf1,
                                         const uint16_t *abuf0, uint8_t *dest,
                                         int dstW, int uvalpha, enum PixelFormat dstFormat,
                                         int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1b(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
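
/* Illustrative sketch (hypothetical *_ref helper): WRITEYUY2 interleaves the
 * luma line with the horizontally subsampled chroma as Y0 U Y1 V, i.e. one
 * U/V pair per two luma samples: */
static inline void RENAME(writeyuy2_ref)(uint8_t *dest, int width,
                                         const uint8_t *y,
                                         const uint8_t *u, const uint8_t *v)
{
    int i;
    for (i = 0; i < width / 2; i++) {
        dest[4*i + 0] = y[2*i];
        dest[4*i + 1] = u[i];
        dest[4*i + 2] = y[2*i + 1];
        dest[4*i + 3] = v[i];
    }
}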
#if !COMPILE_TEMPLATE_MMX2
//FIXME yuy2* can read up to 7 samples too much

static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}
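
/* Scalar equivalent of the loop above (hypothetical *_ref helper): the pand
 * against bm01010101 keeps the low byte of every 16-bit YUYV group, i.e.
 * every even byte, which is exactly the luma plane: */
static inline void RENAME(yuy2ToY_ref)(uint8_t *dst, const uint8_t *src, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = src[2*i]; // Y0 U0 Y1 V0 ... -> even bytes are luma
}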
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}
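
/* Scalar equivalent (hypothetical *_ref helper; width is the chroma width):
 * the psrlw/pand/packuswb sequence above separates the odd bytes of the
 * YUYV stream into the two chroma planes: */
static inline void RENAME(yuy2ToUV_ref)(uint8_t *dstU, uint8_t *dstV,
                                        const uint8_t *src, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dstU[i] = src[4*i + 1]; // Y U Y V -> U at byte 1 of each group
        dstV[i] = src[4*i + 3]; //             V at byte 3
    }
}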
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
{
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
}

/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}

static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}
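
/* Scalar equivalent for the UYVY pair (hypothetical *_ref helper): the byte
 * order is U0 Y0 V0 Y1, so luma sits at the odd bytes (uyvyToY's psrlw $8)
 * and chroma at the even ones (uyvyToUV's pand): */
static inline void RENAME(uyvyToUV_ref)(uint8_t *dstU, uint8_t *dstV,
                                        const uint8_t *src, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dstU[i] = src[4*i + 0];
        dstV[i] = src[4*i + 2];
    }
}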
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "pand %%mm4, %%mm2 \n\t"
        "pand %%mm4, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
}
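
/* Scalar sketch covering both LEToUV and BEToUV (hypothetical *_ref helper):
 * each reduces 16-bit chroma samples to their 8 most significant bits. For
 * little-endian input that is the second byte of every word (psrlw $8), for
 * big-endian input the first (pand with bm01010101): */
static inline void RENAME(c16ToUV_ref)(uint8_t *dstU, uint8_t *dstV,
                                       const uint8_t *src1, const uint8_t *src2,
                                       int width, int big_endian)
{
    int i, msb = big_endian ? 0 : 1;
    for (i = 0; i < width; i++) {
        dstU[i] = src1[2*i + msb];
        dstV[i] = src2[2*i + msb];
    }
}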
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, int width)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "movq %%mm2, (%3, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
}

static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    int width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}

static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    int width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
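
/* Scalar equivalent (hypothetical *_ref helper): NV12/NV21 carry a single
 * interleaved chroma plane; nvXXtoUV deinterleaves it, and nv12ToUV /
 * nv21ToUV above merely swap the destination planes: */
static inline void RENAME(nvXXtoUV_ref)(uint8_t *dst1, uint8_t *dst2,
                                        const uint8_t *src, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dst1[i] = src[2*i];     // U for NV12, V for NV21
        dst2[i] = src[2*i + 1];
    }
}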
#endif /* !COMPILE_TEMPLATE_MMX2 */

static inline void RENAME(bgr24ToY_mmx)(int16_t *dst, const uint8_t *src, int width, enum PixelFormat srcFormat)
{
    if (srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
        "mov %2, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "movd 6(%0), %%mm2 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm4, %%mm0 \n\t"
        "paddd %%mm4, %%mm2 \n\t"
        "psrad $9, %%mm0 \n\t"
        "psrad $9, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dst+width), "g" ((x86_reg)-2*width)
        : "%"REG_a
    );
}
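
/* Scalar sketch of the loop above (hypothetical *_ref helper; cb/cg/cr/off
 * stand in for the 16-bit constants in ff_bgr24toY*Coeff and the bias in
 * ff_bgr24toYOffset, whose exact values live in those tables): each output
 * is a weighted sum of the packed B,G,R bytes, scaled down by the same
 * psrad $9 as the MMX code. For RGB24 input the red/blue weights are
 * swapped via the ff_rgb24to* tables. */
static inline void RENAME(bgr24ToY_ref)(int16_t *dst, const uint8_t *src,
                                        int width, int cb, int cg, int cr,
                                        int off)
{
    int i;
    for (i = 0; i < width; i++) {
        int b = src[3*i + 0], g = src[3*i + 1], r = src[3*i + 2];
        dst[i] = (b*cb + g*cg + r*cr + off) >> 9;
    }
}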
static inline void RENAME(bgr24ToUV_mmx)(int16_t *dstU, int16_t *dstV, const uint8_t *src, int width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq 24(%4), %%mm6 \n\t"
        "mov %3, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pmaddwd (%4), %%mm0 \n\t"
        "pmaddwd 8(%4), %%mm1 \n\t"
        "pmaddwd 16(%4), %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "movd 6(%0), %%mm1 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "pmaddwd (%4), %%mm1 \n\t"
        "pmaddwd 8(%4), %%mm3 \n\t"
        "pmaddwd 16(%4), %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm5 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm5, %%mm4 \n\t"
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
        "paddd %%mm3, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm3, %%mm4 \n\t"
        "psrad $9, %%mm0 \n\t"
        "psrad $9, %%mm2 \n\t"
        "psrad $9, %%mm1 \n\t"
        "psrad $9, %%mm4 \n\t"
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm4, %%mm2 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "movq %%mm2, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-2*width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
        : "%"REG_a
    );
}
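
/* Scalar sketch of bgr24ToUV_mmx (hypothetical *_ref helper; cu[]/cv[] and
 * off stand in for the coefficient block selected via ff_bgr24toUV[] and for
 * ff_bgr24toUVOffset; the MMX loop computes the same sums four at a time): */
static inline void RENAME(bgr24ToUV_ref)(int16_t *dstU, int16_t *dstV,
                                         const uint8_t *src, int width,
                                         const int cu[3], const int cv[3],
                                         int off)
{
    int i;
    for (i = 0; i < width; i++) {
        int b = src[3*i + 0], g = src[3*i + 1], r = src[3*i + 2];
        dstU[i] = (b*cu[0] + g*cu[1] + r*cu[2] + off) >> 9;
        dstV[i] = (b*cv[0] + g*cv[1] + r*cv[2] + off) >> 9;
    }
}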
static inline void RENAME(bgr24ToY)(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
}

static inline void RENAME(bgr24ToUV)(int16_t *dstU, int16_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
{
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
    assert(src1 == src2);
}

static inline void RENAME(rgb24ToY)(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
}

static inline void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
{
    assert(src1==src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
}
#if !COMPILE_TEMPLATE_MMX2
// bilinear / bicubic scaling

static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, int filterSize)
{
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
            "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm4 \n\t"
            "pmaddwd %%mm2, %%mm5 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "paddd %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor %%mm7, %%mm7 \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "mov %2, %%"REG_c" \n\t"
            "movzwl (%%"REG_c", %0), %%eax \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c" \n\t"
            "pxor %%mm4, %%mm4 \n\t"
            "pxor %%mm5, %%mm5 \n\t"
            "2: \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1, %6), %%mm3 \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "paddd %%mm3, %%mm5 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "add $8, %1 \n\t"
            "add $4, %%"REG_c" \n\t"
            "cmp %4, %%"REG_c" \n\t"
            " jb 2b \n\t"
            "add %6, %1 \n\t"
            "movq %%mm4, %%mm0 \n\t"
            "punpckldq %%mm5, %%mm4 \n\t"
            "punpckhdq %%mm5, %%mm0 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "psrad $7, %%mm4 \n\t"
            "packssdw %%mm4, %%mm4 \n\t"
            "mov %3, %%"REG_a" \n\t"
            "movd %%mm4, (%%"REG_a", %0) \n\t"
            "add $4, %0 \n\t"
            " jnc 1b \n\t"
            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
}
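
/* Plain C reference for hScale (hypothetical *_ref helper): each output
 * sample is the filterSize-tap convolution of the source window starting at
 * filterPos[i], scaled down by the same >>7 as the psrad above. This mirrors
 * the scalar fallback in hScale16 below with shift == 7; packssdw in the asm
 * saturates both ways, while here only the upper bound is clipped, as in
 * that fallback. */
static inline void RENAME(hScale_ref)(int16_t *dst, int dstW,
                                      const uint8_t *src,
                                      const int16_t *filter,
                                      const int16_t *filterPos, int filterSize)
{
    int i, j;
    for (i = 0; i < dstW; i++) {
        int srcPos = filterPos[i];
        int val = 0;
        for (j = 0; j < filterSize; j++)
            val += ((int)src[srcPos + j]) * filter[filterSize*i + j];
        dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
    }
}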
#endif /* !COMPILE_TEMPLATE_MMX2 */

static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
                                    const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
{
    int i, j;

    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4 && shift<15) { // Always true for upscaling, sometimes for down, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd %5, %%mm7 \n\t"
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
            "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad %%mm7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8 && shift<15) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd %5, %%mm7 \n\t"
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
            "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movq 8(%3, %%"REG_a", 2), %%mm4 \n\t"
            "movq 8(%3, %%"REG_b", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm4 \n\t"
            "pmaddwd %%mm2, %%mm5 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "paddd %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad %%mm7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (shift<15) {
        const uint16_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd %7, %%mm7 \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "mov %2, %%"REG_c" \n\t"
            "movzwl (%%"REG_c", %0), %%eax \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c" \n\t"
            "pxor %%mm4, %%mm4 \n\t"
            "pxor %%mm5, %%mm5 \n\t"
            "2: \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1, %6), %%mm3 \n\t"
            "movq (%%"REG_c", %%"REG_a", 2), %%mm0 \n\t"
            "movq (%%"REG_c", %%"REG_d", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "paddd %%mm3, %%mm5 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "add $8, %1 \n\t"
            "add $8, %%"REG_c" \n\t"
            "cmp %4, %%"REG_c" \n\t"
            " jb 2b \n\t"
            "add %6, %1 \n\t"
            "movq %%mm4, %%mm0 \n\t"
            "punpckldq %%mm5, %%mm4 \n\t"
            "punpckhdq %%mm5, %%mm0 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "psrad %%mm7, %%mm4 \n\t"
            "packssdw %%mm4, %%mm4 \n\t"
            "mov %3, %%"REG_a" \n\t"
            "movd %%mm4, (%%"REG_a", %0) \n\t"
            "add $4, %0 \n\t"
            " jnc 1b \n\t"
            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((x86_reg)filterSize*2), "m"(shift)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    } else
        for (i=0; i<dstW; i++) {
            int srcPos= filterPos[i];
            int val=0;
            for (j=0; j<filterSize; j++) {
                val += ((int)src[srcPos + j])*filter[filterSize*i + j];
            }
            dst[i] = FFMIN(val>>shift, (1<<15)-1); // the cubic equation does overflow ...
        }
}
#if COMPILE_TEMPLATE_MMX2
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        int dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    void *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %5 \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
#if ARCH_X86_64
#define CALL_MMX2_FILTER_CODE \
        "movl (%%"REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
        "add %%"REG_S", %%"REG_c" \n\t"\
        "add %%"REG_a", %%"REG_D" \n\t"\
        "xor %%"REG_a", %%"REG_a" \n\t"
#else
#define CALL_MMX2_FILTER_CODE \
        "movl (%%"REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
        "add %%"REG_a", %%"REG_D" \n\t"\
        "xor %%"REG_a", %%"REG_a" \n\t"
#endif /* ARCH_X86_64 */
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
#if defined(PIC)
        "mov %5, %%"REG_b" \n\t"
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
        dst[i] = src[srcW-1]*128;
}
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
                                        int dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    void *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %7 \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "mov %5, %%"REG_c" \n\t" // src
        "mov %6, %%"REG_D" \n\t" // buf2
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
#if defined(PIC)
        "mov %7, %%"REG_b" \n\t"
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
        dst1[i] = src1[srcW-1]*128;
        dst2[i] = src2[srcW-1]*128;
    }
}
#endif /* COMPILE_TEMPLATE_MMX2 */

static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;

    if (!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
            c->yuv2yuvX = RENAME(yuv2yuvX_ar );
            switch (c->dstFormat) {
            case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
            case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
            case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
            case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
            case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
            default: break;
            }
        } else {
            c->yuv2yuv1 = RENAME(yuv2yuv1 );
            c->yuv2yuvX = RENAME(yuv2yuvX );
            switch (c->dstFormat) {
            case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
            case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
            case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
            case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
            case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
            default: break;
            }
        }

        switch (c->dstFormat) {
        case PIX_FMT_RGB32:
            c->yuv2packed1 = RENAME(yuv2rgb32_1);
            c->yuv2packed2 = RENAME(yuv2rgb32_2);
            break;
        case PIX_FMT_BGR24:
            c->yuv2packed1 = RENAME(yuv2bgr24_1);
            c->yuv2packed2 = RENAME(yuv2bgr24_2);
            break;
        case PIX_FMT_RGB555:
            c->yuv2packed1 = RENAME(yuv2rgb555_1);
            c->yuv2packed2 = RENAME(yuv2rgb555_2);
            break;
        case PIX_FMT_RGB565:
            c->yuv2packed1 = RENAME(yuv2rgb565_1);
            c->yuv2packed2 = RENAME(yuv2rgb565_2);
            break;
        case PIX_FMT_YUYV422:
            c->yuv2packed1 = RENAME(yuv2yuyv422_1);
            c->yuv2packed2 = RENAME(yuv2yuyv422_2);
            break;
        default:
            break;
        }
    }

#if !COMPILE_TEMPLATE_MMX2
    c->hScale = RENAME(hScale );
#endif /* !COMPILE_TEMPLATE_MMX2 */

    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMX2
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed) {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
        c->hyscale_fast = NULL;
        c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMX2
    }
#endif /* COMPILE_TEMPLATE_MMX2 */

#if !COMPILE_TEMPLATE_MMX2
    switch(srcFormat) {
    case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
    case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
    case PIX_FMT_NV12    : c->chrToYV12 = RENAME(nv12ToUV); break;
    case PIX_FMT_NV21    : c->chrToYV12 = RENAME(nv21ToUV); break;
    case PIX_FMT_GRAY16LE :
    case PIX_FMT_YUV420P9LE:
    case PIX_FMT_YUV422P10LE:
    case PIX_FMT_YUV420P10LE:
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE: c->hScale16= RENAME(hScale16); break;
    }
#endif /* !COMPILE_TEMPLATE_MMX2 */
    if (!c->chrSrcHSubSample) {
        switch(srcFormat) {
        case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
        default: break;
        }
    }

    switch (srcFormat) {
#if !COMPILE_TEMPLATE_MMX2
    case PIX_FMT_YUYV422 :
    case PIX_FMT_Y400A   :
        c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422 :
        c->lumToYV12 = RENAME(uyvyToY); break;
#endif /* !COMPILE_TEMPLATE_MMX2 */
    case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
    default: break;
    }
#if !COMPILE_TEMPLATE_MMX2
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
        default: break;
        }
    }
#endif /* !COMPILE_TEMPLATE_MMX2 */
    if (isAnyRGB(c->srcFormat))
        c->hScale16= RENAME(hScale16);
}
  2337. }