/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "swscale_template.h"

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PREFETCH

#if COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
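
/* On MMX2 the writers below use non-temporal stores (movntq) that bypass the
 * cache, since the destination frame is normally not read back; plain MMX
 * falls back to an ordinary movq. MOVNTQ() expands through REAL_MOVNTQ() so
 * that macro arguments are expanded before stringification. */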

#define YSCALEYUV2YV12X(offset, dest, end, pos) \
    __asm__ volatile(\
        "movq "DITHER16"+0(%0), %%mm3 \n\t"\
        "movq "DITHER16"+8(%0), %%mm4 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ".p2align 4 \n\t" /* FIXME Unroll? */\
        "1: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
        "movq 8(%%"REG_S", %3, 2), %%mm5 \n\t" /* srcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        " jnz 1b \n\t"\
        "psraw $3, %%mm3 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm3 \n\t"\
        MOVNTQ(%%mm3, (%1, %3))\
        "add $8, %3 \n\t"\
        "cmp %2, %3 \n\t"\
        "movq "DITHER16"+0(%0), %%mm3 \n\t"\
        "movq "DITHER16"+8(%0), %%mm4 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "r" (&c->redDither),\
           "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
        : "%"REG_d, "%"REG_S\
    );
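
/* YSCALEYUV2YV12X runs the vertical filter for one plane: each output byte
 * is a weighted sum of several source lines. Per sample it computes,
 * roughly (a scalar sketch, not part of the build):
 *
 *     int val = c->dither16[i & 7];                 // accumulator seeded with dither
 *     for (j = 0; j < filterSize; j++)
 *         val += (src[j][i] * filter[j]) >> 16;     // pmulhw
 *     dest[i] = av_clip_uint8(val >> 3);            // psraw $3 + packuswb
 *
 * The filter is laid out as (srcPointer, coefficient) entries, so the inner
 * loop walks them until it hits a NULL pointer (the test/jnz pair). */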

static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                                    uint8_t *aDest, int dstW, int chrDstW,
                                    const uint8_t *lumDither, const uint8_t *chrDither)
{
    int i;
    if (uDest) {
        x86_reg uv_off = c->uv_off;
        for(i=0; i<8; i++) c->dither16[i] = chrDither[i]>>4;
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
        for(i=0; i<8; i++) c->dither16[i] = chrDither[(i+3)&7]>>4;
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
    }
    for(i=0; i<8; i++) c->dither16[i] = lumDither[i]>>4;
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
    }
    YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
}

#define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
    __asm__ volatile(\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "movq "DITHER32"+0(%0), %%mm4 \n\t"\
        "movq "DITHER32"+8(%0), %%mm5 \n\t"\
        "movq "DITHER32"+16(%0), %%mm6 \n\t"\
        "movq "DITHER32"+24(%0), %%mm7 \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ".p2align 4 \n\t"\
        "1: \n\t"\
        "movq (%%"REG_S", %3, 2), %%mm0 \n\t" /* srcData */\
        "movq 8(%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %3, 2), %%mm1 \n\t" /* srcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "movq 8(%%"REG_S", %3, 2), %%mm3 \n\t" /* srcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 1b \n\t"\
        "psrad $19, %%mm4 \n\t"\
        "psrad $19, %%mm5 \n\t"\
        "psrad $19, %%mm6 \n\t"\
        "psrad $19, %%mm7 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "packuswb %%mm6, %%mm4 \n\t"\
        MOVNTQ(%%mm4, (%1, %3))\
        "add $8, %3 \n\t"\
        "cmp %2, %3 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "movq "DITHER32"+0(%0), %%mm4 \n\t"\
        "movq "DITHER32"+8(%0), %%mm5 \n\t"\
        "movq "DITHER32"+16(%0), %%mm6 \n\t"\
        "movq "DITHER32"+24(%0), %%mm7 \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "r" (&c->redDither),\
           "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
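
/* The accurate-rounding variant accumulates in 32 bits: punpcklwd/punpckhwd
 * interleave samples from two source lines so that one pmaddwd computes
 * src_a*coeff_a + src_b*coeff_b per 32-bit lane. The accumulators are seeded
 * from c->dither32 (dither << 12, see the _ar function below) and reduced
 * with psrad $19, so no precision is lost until the final pack, unlike the
 * pmulhw version above, which keeps only the high 16 bits of each product. */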

static inline void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
                                       const int16_t **lumSrc, int lumFilterSize,
                                       const int16_t *chrFilter, const int16_t **chrUSrc,
                                       const int16_t **chrVSrc,
                                       int chrFilterSize, const int16_t **alpSrc,
                                       uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                                       uint8_t *aDest, int dstW, int chrDstW,
                                       const uint8_t *lumDither, const uint8_t *chrDither)
{
    int i;
    if (uDest) {
        x86_reg uv_off = c->uv_off;
        for(i=0; i<8; i++) c->dither32[i] = chrDither[i]<<12;
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
        for(i=0; i<8; i++) c->dither32[i] = chrDither[(i+3)&7]<<12;
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
    }
    for(i=0; i<8; i++) c->dither32[i] = lumDither[i]<<12;
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
    }
    YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
}

#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ".p2align 4 \n\t" /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
                                    const int16_t *chrUSrc, const int16_t *chrVSrc,
                                    const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                                    uint8_t *aDest, int dstW, int chrDstW,
                                    const uint8_t *lumDither, const uint8_t *chrDither)
{
    int p= 4;
    const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
    uint8_t *dst[4]= { aDest, dest, uDest, vDest };
    x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
    while (p--) {
        if (dst[p]) {
            __asm__ volatile(
                YSCALEYUV2YV121
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }
}

#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "movq 0(%3), %%mm6 \n\t"\
    "movq 8(%3), %%mm7 \n\t"\
    ".p2align 4 \n\t" /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm6, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
                                       const int16_t *chrUSrc, const int16_t *chrVSrc,
                                       const int16_t *alpSrc,
                                       uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                                       uint8_t *aDest, int dstW, int chrDstW,
                                       const uint8_t *lumDither, const uint8_t *chrDither)
{
    int p= 4;
    const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
    uint8_t *dst[4]= { aDest, dest, uDest, vDest };
    x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
    while (p--) {
        if (dst[p]) {
            int i;
            for(i=0; i<8; i++) c->dither16[i] = i<2 ? lumDither[i] : chrDither[i];
            __asm__ volatile(
                YSCALEYUV2YV121_ACCURATE
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p]), "r"(c->dither16)
                : "%"REG_a
            );
        }
    }
}

#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW_reg), "m"(uv_off) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );
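
/* YSCALEYUV2PACKEDX opens an asm statement and runs the vertical chroma and
 * luma filters (the inner "2:" loops); the WRITE* macro used after it
 * supplies the bottom of the outer "1:" loop over output pixels, and
 * YSCALEYUV2PACKEDX_END provides the operand lists and closes the statement.
 * The pieces only make sense pasted together in that order. */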

#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "movq %%mm4, "U_TEMP"(%0) \n\t"\
        "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)

static inline void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                          const int16_t **lumSrc, int lumFilterSize,
                                          const int16_t *chrFilter, const int16_t **chrUSrc,
                                          const int16_t **chrVSrc,
                                          int chrFilterSize, const int16_t **alpSrc,
                                          uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "movq %%mm2, "U_TEMP"(%0) \n\t"
        "movq %%mm4, "V_TEMP"(%0) \n\t"
        "movq %%mm5, "Y_TEMP"(%0) \n\t"
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
        "movq "Y_TEMP"(%0), %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

static inline void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
                                       const int16_t **lumSrc, int lumFilterSize,
                                       const int16_t *chrFilter, const int16_t **chrUSrc,
                                       const int16_t **chrVSrc,
                                       int chrFilterSize, const int16_t **alpSrc,
                                       uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
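
/* RGB565 packing: B and R are masked to their top 5 bits (0xF8), G to its
 * top 6 (0xFC), and the punpck/psllq/por sequence fuses them into 16-bit
 * 5-6-5 pixels, eight at a time. */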

static inline void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                           const int16_t **lumSrc, int lumFilterSize,
                                           const int16_t *chrFilter, const int16_t **chrUSrc,
                                           const int16_t **chrVSrc,
                                           int chrFilterSize, const int16_t **alpSrc,
                                           uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static inline void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
                                        const int16_t **lumSrc, int lumFilterSize,
                                        const int16_t *chrFilter, const int16_t **chrUSrc,
                                        const int16_t **chrVSrc,
                                        int chrFilterSize, const int16_t **alpSrc,
                                        uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
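
/* RGB555 differs only in the shifts: all three channels keep 5 bits (0xF8
 * mask), R is pre-shifted right by one and G left by two, leaving the top
 * bit of each 16-bit pixel clear. */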

static inline void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                           const int16_t **lumSrc, int lumFilterSize,
                                           const int16_t *chrFilter, const int16_t **chrUSrc,
                                           const int16_t **chrVSrc,
                                           int chrFilterSize, const int16_t **alpSrc,
                                           uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static inline void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
                                        const int16_t **lumSrc, int lumFilterSize,
                                        const int16_t *chrFilter, const int16_t **chrUSrc,
                                        const int16_t **chrVSrc,
                                        int chrFilterSize, const int16_t **alpSrc,
                                        uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
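
/* Two strategies for the awkward 24-bit stores: the plain MMX writer builds
 * the byte stream out of 0RGB dwords with shifts and ORs, while the MMX2
 * writer uses pshufw plus the ff_M24A/B/C masks to route each byte directly
 * into place. Both emit three qwords (24 bytes = 8 pixels) per iteration. */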

static inline void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                          const int16_t **lumSrc, int lumFilterSize,
                                          const int16_t *chrFilter, const int16_t **chrUSrc,
                                          const int16_t **chrVSrc,
                                          int chrFilterSize, const int16_t **alpSrc,
                                          uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

static inline void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
                                       const int16_t **lumSrc, int lumFilterSize,
                                       const int16_t *chrFilter, const int16_t **chrUSrc,
                                       const int16_t **chrVSrc,
                                       int chrFilterSize, const int16_t **alpSrc,
                                       uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
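
/* YUY2 writer: U and V are packed to bytes and interleaved into a UVUV...
 * qword, which is then interleaved again with the packed luma to produce
 * the Y0 U0 Y1 V0 Y2 U1 Y3 V1 ... byte order of YUYV422. */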

static inline void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                            const int16_t **lumSrc, int lumFilterSize,
                                            const int16_t *chrFilter, const int16_t **chrUSrc,
                                            const int16_t **chrVSrc,
                                            int chrFilterSize, const int16_t **alpSrc,
                                            uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX_ACCURATE
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
                                         const int16_t **lumSrc, int lumFilterSize,
                                         const int16_t *chrFilter, const int16_t **chrUSrc,
                                         const int16_t **chrVSrc,
                                         int chrFilterSize, const int16_t **alpSrc,
                                         uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;
    YSCALEYUV2PACKEDX
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax+4] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
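
/* The "2" functions below blend two source lines per output line. Per
 * sample the macros above compute, roughly (a sketch):
 *
 *     Y = (buf1[i]   >> 4) + (((buf0[i]   - buf1[i])   * yalpha ) >> 16);
 *     U = (uvbuf1[i] >> 4) + (((uvbuf0[i] - uvbuf1[i]) * uvalpha) >> 16);
 *
 * with the blend factors read from the MMX filter blocks in SwsContext,
 * before feeding Y/U/V through the same RGB coefficient code as above. */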

/**
 * Vertical bilinear scaling of YV12 to RGB.
 */
static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0,
                                       const uint16_t *buf1, const uint16_t *ubuf0,
                                       const uint16_t *ubuf1, const uint16_t *vbuf0,
                                       const uint16_t *vbuf1, const uint16_t *abuf0,
                                       const uint16_t *abuf1, uint8_t *dest,
                                       int dstW, int yalpha, int uvalpha, int y)
{
    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
        __asm__ volatile(
            YSCALEYUV2RGB(%%r8, %5)
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
               "a" (&c->redDither),
               "r" (abuf0), "r" (abuf1)
            : "%r8"
        );
#else
        c->u_temp=(intptr_t)abuf0;
        c->v_temp=(intptr_t)abuf1;
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "push %0 \n\t"
            "push %1 \n\t"
            "mov "U_TEMP"(%5), %0 \n\t"
            "mov "V_TEMP"(%5), %1 \n\t"
            YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            "pop %1 \n\t"
            "pop %0 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
#endif
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
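
/* On x86-32, %ebx (PIC register) and %ebp (frame pointer) cannot safely be
 * listed as clobbered, so the blocks above and below save REG_b into the
 * context (ESP_OFFSET) and push REG_BP by hand, then reuse them as the
 * destination pointer and loop index for the duration of the asm. */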

static inline void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0,
                                       const uint16_t *buf1, const uint16_t *ubuf0,
                                       const uint16_t *ubuf1, const uint16_t *vbuf0,
                                       const uint16_t *vbuf1, const uint16_t *abuf0,
                                       const uint16_t *abuf1, uint8_t *dest,
                                       int dstW, int yalpha, int uvalpha, int y)
{
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

static inline void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0,
                                        const uint16_t *buf1, const uint16_t *ubuf0,
                                        const uint16_t *ubuf1, const uint16_t *vbuf0,
                                        const uint16_t *vbuf1, const uint16_t *abuf0,
                                        const uint16_t *abuf1, uint8_t *dest,
                                        int dstW, int yalpha, int uvalpha, int y)
{
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB15(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0,
                                        const uint16_t *buf1, const uint16_t *ubuf0,
                                        const uint16_t *ubuf1, const uint16_t *vbuf0,
                                        const uint16_t *vbuf1, const uint16_t *abuf0,
                                        const uint16_t *abuf1, uint8_t *dest,
                                        int dstW, int yalpha, int uvalpha, int y)
{
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB16(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax+4] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0,
                                         const uint16_t *buf1, const uint16_t *ubuf0,
                                         const uint16_t *ubuf1, const uint16_t *vbuf0,
                                         const uint16_t *vbuf1, const uint16_t *abuf0,
                                         const uint16_t *abuf1, uint8_t *dest,
                                         int dstW, int yalpha, int uvalpha, int y)
{
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2PACKED(%%REGBP, %5)
        WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
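
/* The "1" functions use a single source line. Plain YSCALEYUV2RGB1 reads
 * chroma from uvbuf0 only, which effectively shifts chrominance by half a
 * pixel vertically but is a bit faster; YSCALEYUV2RGB1b below averages
 * uvbuf0 and uvbuf1 instead (callers pick one based on uvalpha). */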

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"

#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)

/**
 * YV12 to RGB without scaling or interpolation.
 */
static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0,
                                       const uint16_t *ubuf0, const uint16_t *ubuf1,
                                       const uint16_t *vbuf0, const uint16_t *vbuf1,
                                       const uint16_t *abuf0, uint8_t *dest,
                                       int dstW, int uvalpha, enum PixelFormat dstFormat,
                                       int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    } else {
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    }
}

static inline void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0,
                                       const uint16_t *ubuf0, const uint16_t *ubuf1,
                                       const uint16_t *vbuf0, const uint16_t *vbuf1,
                                       const uint16_t *abuf0, uint8_t *dest,
                                       int dstW, int uvalpha, enum PixelFormat dstFormat,
                                       int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}

static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0,
                                        const uint16_t *ubuf0, const uint16_t *ubuf1,
                                        const uint16_t *vbuf0, const uint16_t *vbuf1,
                                        const uint16_t *abuf0, uint8_t *dest,
                                        int dstW, int uvalpha, enum PixelFormat dstFormat,
                                        int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
  1392. static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
  1393. const uint16_t *ubuf0, const uint16_t *ubuf1,
  1394. const uint16_t *vbuf0, const uint16_t *vbuf1,
  1395. const uint16_t *abuf0, uint8_t *dest,
  1396. int dstW, int uvalpha, enum PixelFormat dstFormat,
  1397. int flags, int y)
  1398. {
  1399. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1400. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1401. __asm__ volatile(
  1402. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1403. "mov %4, %%"REG_b" \n\t"
  1404. "push %%"REG_BP" \n\t"
  1405. YSCALEYUV2RGB1(%%REGBP, %5)
  1406. "pxor %%mm7, %%mm7 \n\t"
  1407. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1408. #ifdef DITHER1XBPP
  1409. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1410. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1411. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1412. #endif
  1413. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1414. "pop %%"REG_BP" \n\t"
  1415. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1416. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1417. "a" (&c->redDither)
  1418. );
  1419. } else {
  1420. __asm__ volatile(
  1421. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1422. "mov %4, %%"REG_b" \n\t"
  1423. "push %%"REG_BP" \n\t"
  1424. YSCALEYUV2RGB1b(%%REGBP, %5)
  1425. "pxor %%mm7, %%mm7 \n\t"
  1426. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1427. #ifdef DITHER1XBPP
  1428. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1429. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1430. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1431. #endif
  1432. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1433. "pop %%"REG_BP" \n\t"
  1434. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1435. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1436. "a" (&c->redDither)
  1437. );
  1438. }
  1439. }
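
/*
 * Load macros for the unscaled packed-output path: PACKED1 reads one
 * chroma line and shifts it into range (psraw $7), PACKED1b sums two
 * chroma lines and halves them via psrlw $8. Both leave luma in mm1/mm7
 * and chroma in mm3/mm4 for the WRITE* macros.
 */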
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"

#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
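
/*
 * Packed YUYV 4:2:2 output from a single input line; only the chroma
 * load differs between the two branches, the actual Y/U/V interleaving
 * is done by WRITEYUY2.
 */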
static inline void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0,
                                         const uint16_t *ubuf0, const uint16_t *ubuf1,
                                         const uint16_t *vbuf0, const uint16_t *vbuf1,
                                         const uint16_t *abuf0, uint8_t *dest,
                                         int dstW, int uvalpha, enum PixelFormat dstFormat,
                                         int flags, int y)
{
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1b(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
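
/*
 * Input-side converters: unpack packed 4:2:2 and semi-planar sources
 * into the planar 8-bit buffers the scaler works on. bm01010101 is the
 * 0x00FF00FF00FF00FF byte mask, so "pand bm01010101" keeps the even
 * bytes of each quadword while "psrlw $8" selects the odd ones.
 */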
#if !COMPILE_TEMPLATE_MMX2
//FIXME yuy2* can read up to 7 samples too much

static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}

static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}

static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
{
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
}

/* This is almost identical to the previous, and exists only because
 * yuy2To(Y|UV)(dst, src+1, ...) would have 100% unaligned accesses. */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}

static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}

static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "pand %%mm4, %%mm2 \n\t"
        "pand %%mm4, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
}
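
/*
 * Deinterleave the packed chroma plane of NV12/NV21: even bytes go to
 * dst1, odd bytes to dst2. nv12ToUV/nv21ToUV below only differ in the
 * order of the destination planes.
 */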
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, int width)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "movq %%mm2, (%3, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
}

static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    int width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}

static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    int width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
#endif /* !COMPILE_TEMPLATE_MMX2 */
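
/*
 * RGB24/BGR24 -> Y: each pixel pair (6 bytes) is loaded as two
 * overlapping dwords, zero-extended to words and multiplied against the
 * packed coefficient pairs with pmaddwd; after adding the rounding
 * offset the sums are shifted right by 9 and packed to signed 16-bit
 * luma. Four pixels (12 bytes in, 8 bytes out) are produced per loop
 * iteration.
 */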
static inline void RENAME(bgr24ToY_mmx)(int16_t *dst, const uint8_t *src, int width, enum PixelFormat srcFormat)
{
    if (srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
        "mov %2, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "movd 6(%0), %%mm2 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm4, %%mm0 \n\t"
        "paddd %%mm4, %%mm2 \n\t"
        "psrad $9, %%mm0 \n\t"
        "psrad $9, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dst+width), "g" ((x86_reg)-2*width)
        : "%"REG_a
    );
}
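
/*
 * RGB24/BGR24 -> U,V: same load pattern as the luma routine above, but
 * with two pmaddwd passes per pixel pair against the U and V coefficient
 * rows of ff_bgr24toUV[], indexed by srcFormat so one body handles both
 * byte orders.
 */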
static inline void RENAME(bgr24ToUV_mmx)(int16_t *dstU, int16_t *dstV, const uint8_t *src, int width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq 24(%4), %%mm6 \n\t"
        "mov %3, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pmaddwd (%4), %%mm0 \n\t"
        "pmaddwd 8(%4), %%mm1 \n\t"
        "pmaddwd 16(%4), %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "movd 6(%0), %%mm1 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "pmaddwd (%4), %%mm1 \n\t"
        "pmaddwd 8(%4), %%mm3 \n\t"
        "pmaddwd 16(%4), %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm5 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm5, %%mm4 \n\t"
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
        "paddd %%mm3, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm3, %%mm4 \n\t"
        "psrad $9, %%mm0 \n\t"
        "psrad $9, %%mm2 \n\t"
        "psrad $9, %%mm1 \n\t"
        "psrad $9, %%mm4 \n\t"
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm4, %%mm2 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "movq %%mm2, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-2*width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
        : "%"REG_a
    );
}

static inline void RENAME(bgr24ToY)(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
}

static inline void RENAME(bgr24ToUV)(int16_t *dstU, int16_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
{
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
    assert(src1 == src2);
}

static inline void RENAME(rgb24ToY)(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
}

static inline void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused)
{
    assert(src1==src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
}

#if !COMPILE_TEMPLATE_MMX2
// bilinear / bicubic scaling
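/*
 * For each output sample this computes
 *     dst[i] = (sum_j src[filterPos[i] + j] * filter[filterSize*i + j]) >> 7
 * (compare the C reference loop at the end of hScale16 below). The MMX
 * code specializes filterSize 4 and 8; any other size takes the generic
 * two-level loop in the last branch.
 */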
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, int filterSize)
{
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
            "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm4 \n\t"
            "pmaddwd %%mm2, %%mm5 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "paddd %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor %%mm7, %%mm7 \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "mov %2, %%"REG_c" \n\t"
            "movzwl (%%"REG_c", %0), %%eax \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c" \n\t"
            "pxor %%mm4, %%mm4 \n\t"
            "pxor %%mm5, %%mm5 \n\t"
            "2: \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1, %6), %%mm3 \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "paddd %%mm3, %%mm5 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "add $8, %1 \n\t"
            "add $4, %%"REG_c" \n\t"
            "cmp %4, %%"REG_c" \n\t"
            " jb 2b \n\t"
            "add %6, %1 \n\t"
            "movq %%mm4, %%mm0 \n\t"
            "punpckldq %%mm5, %%mm4 \n\t"
            "punpckhdq %%mm5, %%mm0 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "psrad $7, %%mm4 \n\t"
            "packssdw %%mm4, %%mm4 \n\t"
            "mov %3, %%"REG_a" \n\t"
            "movd %%mm4, (%%"REG_a", %0) \n\t"
            "add $4, %0 \n\t"
            " jnc 1b \n\t"
            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
}
#endif /* !COMPILE_TEMPLATE_MMX2 */
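
/*
 * 16-bit-per-sample variant of hScale. The right shift is a parameter
 * (kept in mm7) because deep-color inputs need a different
 * normalization; for shift >= 15 the plain C fallback is used, which
 * also clips the result since the filter sum can overflow.
 */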
static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
                                    const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
{
    int i, j;

    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4 && shift<15) { // Always true for upscaling, sometimes for down, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd %5, %%mm7 \n\t"
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
            "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad %%mm7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8 && shift<15) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd %5, %%mm7 \n\t"
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
            "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movq 8(%3, %%"REG_a", 2), %%mm4 \n\t"
            "movq 8(%3, %%"REG_b", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm4 \n\t"
            "pmaddwd %%mm2, %%mm5 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "paddd %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad %%mm7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (shift<15) {
        const uint16_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "movd %7, %%mm7 \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "mov %2, %%"REG_c" \n\t"
            "movzwl (%%"REG_c", %0), %%eax \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c" \n\t"
            "pxor %%mm4, %%mm4 \n\t"
            "pxor %%mm5, %%mm5 \n\t"
            "2: \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1, %6), %%mm3 \n\t"
            "movq (%%"REG_c", %%"REG_a", 2), %%mm0 \n\t"
            "movq (%%"REG_c", %%"REG_d", 2), %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "paddd %%mm3, %%mm5 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "add $8, %1 \n\t"
            "add $8, %%"REG_c" \n\t"
            "cmp %4, %%"REG_c" \n\t"
            " jb 2b \n\t"
            "add %6, %1 \n\t"
            "movq %%mm4, %%mm0 \n\t"
            "punpckldq %%mm5, %%mm4 \n\t"
            "punpckhdq %%mm5, %%mm0 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "psrad %%mm7, %%mm4 \n\t"
            "packssdw %%mm4, %%mm4 \n\t"
            "mov %3, %%"REG_a" \n\t"
            "movd %%mm4, (%%"REG_a", %0) \n\t"
            "add $4, %0 \n\t"
            " jnc 1b \n\t"
            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((x86_reg)filterSize*2), "m"(shift)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    } else
        for (i=0; i<dstW; i++) {
            int srcPos= filterPos[i];
            int val=0;
            for (j=0; j<filterSize; j++) {
                val += ((int)src[srcPos + j])*filter[filterSize*i + j];
            }
            dst[i] = FFMIN(val>>shift, (1<<15)-1); // the cubic equation does overflow ...
        }
}

#if COMPILE_TEMPLATE_MMX2
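
/*
 * Fast bilinear horizontal scaler: instead of looping over filter taps,
 * this jumps into code generated at init time (c->lumMmx2FilterCode /
 * c->chrMmx2FilterCode) through CALL_MMX2_FILTER_CODE. The few rightmost
 * pixels the generated code cannot produce are filled with the last
 * source sample afterwards.
 */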
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        int dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    void *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %5 \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"

#if ARCH_X86_64
#define CALL_MMX2_FILTER_CODE \
    "movl (%%"REG_b"), %%esi \n\t"\
    "call *%4 \n\t"\
    "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
    "add %%"REG_S", %%"REG_c" \n\t"\
    "add %%"REG_a", %%"REG_D" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\

#else
#define CALL_MMX2_FILTER_CODE \
    "movl (%%"REG_b"), %%esi \n\t"\
    "call *%4 \n\t"\
    "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
    "add %%"REG_a", %%"REG_D" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE

#if defined(PIC)
        "mov %5, %%"REG_b" \n\t"
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
        dst[i] = src[srcW-1]*128;
}

static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
                                        int dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    void *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %7 \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"

        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "mov %5, %%"REG_c" \n\t" // src
        "mov %6, %%"REG_D" \n\t" // buf2
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"

        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE

#if defined(PIC)
        "mov %7, %%"REG_b" \n\t"
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
        dst1[i] = src1[srcW-1]*128;
        dst2[i] = src2[srcW-1]*128;
    }
}
#endif /* COMPILE_TEMPLATE_MMX2 */

#if !COMPILE_TEMPLATE_MMX2
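
/*
 * Refill the pointer/coefficient blocks consumed by the vertical MMX
 * scalers for the current output line, and pick this line's ordered
 * dither offsets. Each tap is stored as a source-line pointer plus its
 * coefficient replicated into both 16-bit halves of a dword
 * (the *0x10001 below).
 */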
static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
                                  int lastInLumBuf, int lastInChrBuf)
{
    const int dstH= c->dstH;
    const int flags= c->flags;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrUPixBuf= c->chrUPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int chrDstY= dstY>>c->chrDstVSubSample;
    const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
    const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input

    c->blueDither= ff_dither8[dstY&1];
    if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
        c->greenDither= ff_dither8[dstY&1];
    else
        c->greenDither= ff_dither4[dstY&1];
    c->redDither= ff_dither8[(dstY+1)&1];
    if (dstY < dstH - 2) {
        const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
        const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
        const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
        int i;
        if (flags & SWS_ACCURATE_RND) {
            int s= APCK_SIZE / 8;
            for (i=0; i<vLumFilterSize; i+=2) {
                *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                lumMmxFilter[s*i+APCK_COEF/4  ]=
                lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                    *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                    *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                    alpMmxFilter[s*i+APCK_COEF/4  ]=
                    alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                }
            }
            for (i=0; i<vChrFilterSize; i+=2) {
                *(const void**)&chrMmxFilter[s*i              ]= chrUSrcPtr[i  ];
                *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrUSrcPtr[i+(vChrFilterSize>1)];
                chrMmxFilter[s*i+APCK_COEF/4  ]=
                chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
            }
        } else {
            for (i=0; i<vLumFilterSize; i++) {
                *(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
                lumMmxFilter[4*i+2]=
                lumMmxFilter[4*i+3]=
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                    *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
                    alpMmxFilter[4*i+2]=
                    alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                }
            }
            for (i=0; i<vChrFilterSize; i++) {
                *(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
                chrMmxFilter[4*i+2]=
                chrMmxFilter[4*i+3]=
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
            }
        }
    }
}
#endif /* !COMPILE_TEMPLATE_MMX2 */
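
/*
 * Wire up the SwsContext function pointers to the routines compiled in
 * this template instance (plain MMX or MMX2, chosen by
 * COMPILE_TEMPLATE_MMX2 at build time and by the context flags at run
 * time).
 */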
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;

    if (!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
            c->yuv2yuvX = RENAME(yuv2yuvX_ar );
            switch (c->dstFormat) {
            case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
            case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
            case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
            case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
            case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
            default: break;
            }
        } else {
            c->yuv2yuv1 = RENAME(yuv2yuv1 );
            c->yuv2yuvX = RENAME(yuv2yuvX );
            switch (c->dstFormat) {
            case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
            case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
            case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
            case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
            case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
            default: break;
            }
        }
        switch (c->dstFormat) {
        case PIX_FMT_RGB32:
            c->yuv2packed1 = RENAME(yuv2rgb32_1);
            c->yuv2packed2 = RENAME(yuv2rgb32_2);
            break;
        case PIX_FMT_BGR24:
            c->yuv2packed1 = RENAME(yuv2bgr24_1);
            c->yuv2packed2 = RENAME(yuv2bgr24_2);
            break;
        case PIX_FMT_RGB555:
            c->yuv2packed1 = RENAME(yuv2rgb555_1);
            c->yuv2packed2 = RENAME(yuv2rgb555_2);
            break;
        case PIX_FMT_RGB565:
            c->yuv2packed1 = RENAME(yuv2rgb565_1);
            c->yuv2packed2 = RENAME(yuv2rgb565_2);
            break;
        case PIX_FMT_YUYV422:
            c->yuv2packed1 = RENAME(yuv2yuyv422_1);
            c->yuv2packed2 = RENAME(yuv2yuyv422_2);
            break;
        default:
            break;
        }
    }

#if !COMPILE_TEMPLATE_MMX2
    c->hScale = RENAME(hScale);
#endif /* !COMPILE_TEMPLATE_MMX2 */

    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMX2
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed) {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
        c->hyscale_fast = NULL;
        c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMX2
    }
#endif /* COMPILE_TEMPLATE_MMX2 */

#if !COMPILE_TEMPLATE_MMX2
    switch(srcFormat) {
    case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
    case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
    case PIX_FMT_NV12    : c->chrToYV12 = RENAME(nv12ToUV); break;
    case PIX_FMT_NV21    : c->chrToYV12 = RENAME(nv21ToUV); break;
    case PIX_FMT_GRAY16LE   :
    case PIX_FMT_YUV420P9LE :
    case PIX_FMT_YUV422P10LE:
    case PIX_FMT_YUV420P10LE:
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE: c->hScale16= RENAME(hScale16); break;
    }
#endif /* !COMPILE_TEMPLATE_MMX2 */
    if (!c->chrSrcHSubSample) {
        switch(srcFormat) {
        case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
        default: break;
        }
    }

    switch (srcFormat) {
#if !COMPILE_TEMPLATE_MMX2
    case PIX_FMT_YUYV422 :
    case PIX_FMT_Y400A   :
        c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422 :
        c->lumToYV12 = RENAME(uyvyToY); break;
#endif /* !COMPILE_TEMPLATE_MMX2 */
    case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
    default: break;
    }
#if !COMPILE_TEMPLATE_MMX2
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
        default: break;
        }
    }
#endif /* !COMPILE_TEMPLATE_MMX2 */
    if (isAnyRGB(c->srcFormat))
        c->hScale16= RENAME(hScale16);
}