/*
 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/x86/asm.h"
#include "libswscale/swscale_internal.h"
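
/*
 * This file is a template: it is #included once per instruction-set variant,
 * with RENAME() mangling the function names and COMPILE_TEMPLATE_MMXEXT
 * selecting between the plain MMX and the MMXEXT code paths below.
 */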

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef MOVNTQ2
#undef PREFETCH

#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
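
/* Load the 8 dither bytes into mm3/mm4 as zero-extended words (low half in
 * mm3, high half in mm4); a non-zero 'rot' first rotates the pattern by
 * 3 bytes so that alternate lines see a shifted dither matrix. */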
#if !COMPILE_TEMPLATE_MMXEXT
static av_always_inline void
dither_8to16(const uint8_t *srcDither, int rot)
{
    if (rot) {
        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
                         "movq       (%0), %%mm3\n\t"
                         "movq      %%mm3, %%mm4\n\t"
                         "psrlq      $24,  %%mm3\n\t"
                         "psllq      $40,  %%mm4\n\t"
                         "por       %%mm4, %%mm3\n\t"
                         "movq      %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         :: "r"(srcDither)
                         );
    } else {
        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
                         "movq       (%0), %%mm3\n\t"
                         "movq      %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         :: "r"(srcDither)
                         );
    }
}
#endif
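
/* Vertical scaling to a planar 8-bit destination: walks the {source line,
 * coefficient} list built by the caller (terminated by a NULL pointer),
 * accumulates the pmulhw products into mm3/mm4 on top of the dither loaded
 * above, then shifts, packs and stores 8 pixels per iteration. */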
static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
                             const int16_t **src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
{
    dither_8to16(dither, offset);
    filterSize--;
    __asm__ volatile(
        "movd %0, %%mm1\n\t"
        "punpcklwd %%mm1, %%mm1\n\t"
        "punpckldq %%mm1, %%mm1\n\t"
        "psllw $3, %%mm1\n\t"
        "paddw %%mm1, %%mm3\n\t"
        "paddw %%mm1, %%mm4\n\t"
        "psraw $4, %%mm3\n\t"
        "psraw $4, %%mm4\n\t"
        ::"m"(filterSize)
        );
    __asm__ volatile(
        "movq %%mm3, %%mm6\n\t"
        "movq %%mm4, %%mm7\n\t"
        "movl %3, %%ecx\n\t"
        "mov %0, %%"REG_d" \n\t"
        "mov (%%"REG_d"), %%"REG_S" \n\t"
        ".p2align 4 \n\t" /* FIXME Unroll? */
        "1: \n\t"
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */
        "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */
        "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */
        "add $16, %%"REG_d" \n\t"
        "mov (%%"REG_d"), %%"REG_S" \n\t"
        "test %%"REG_S", %%"REG_S" \n\t"
        "pmulhw %%mm0, %%mm2 \n\t"
        "pmulhw %%mm0, %%mm5 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "paddw %%mm5, %%mm4 \n\t"
        " jnz 1b \n\t"
        "psraw $3, %%mm3 \n\t"
        "psraw $3, %%mm4 \n\t"
        "packuswb %%mm4, %%mm3 \n\t"
        MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"
        "add $8, %%"REG_c" \n\t"
        "cmp %2, %%"REG_c" \n\t"
        "movq %%mm6, %%mm3\n\t"
        "movq %%mm7, %%mm4\n\t"
        "mov %0, %%"REG_d" \n\t"
        "mov (%%"REG_d"), %%"REG_S" \n\t"
        "jb 1b \n\t"
        :: "g" (filter),
           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
        : "%"REG_d, "%"REG_S, "%"REG_c
        );
}
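
/*
 * YSCALEYUV2PACKEDX* perform the vertical filtering step for packed output:
 * _UV accumulates the chroma taps into mm3 (U) and mm4 (V), while _YA does
 * the same for two groups of four luma (or alpha) samples. Each inner loop
 * walks a {pointer, coefficient} list terminated by a NULL pointer.
 */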
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
        "lea "offset"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
        "movq "#dst1", "#dst2" \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
        "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw "#coeff", "#src1" \n\t"\
        "pmulhw "#coeff", "#src2" \n\t"\
        "paddw "#src1", "#dst1" \n\t"\
        "paddw "#src2", "#dst2" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7)

#define YSCALEYUV2PACKEDX_END \
        :: "r" (&c->redDither), \
           "m" (dummy), "m" (dummy), "m" (dummy),\
           "r" (dest), "m" (dstW_reg), "m"(uv_off) \
           NAMED_CONSTRAINTS_ADD(bF8,bFC) \
        : "%"REG_a, "%"REG_d, "%"REG_S \
        );
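
/*
 * The _ACCURATE variants keep the accumulation at 32-bit precision: two taps
 * are interleaved with punpck{l,h}wd and multiplied with pmaddwd, and the
 * dword sums are only packed back to words after the loop. This is the path
 * taken for SWS_ACCURATE_RND (see RENAME(sws_init_swscale) below).
 */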
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "movq %%mm4, "U_TEMP"(%0) \n\t"\
        "movq %%mm6, "V_TEMP"(%0) \n\t"

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
        "lea "offset"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm1, %%mm1 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
        "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm4, %%mm0 \n\t"\
        "punpckhwd %%mm4, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
        "pmaddwd %%mm4, %%mm0 \n\t"\
        "pmaddwd %%mm4, %%mm3 \n\t"\
        "paddd %%mm0, %%mm1 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm4, %%mm2 \n\t"\
        "pmaddwd %%mm4, %%mm0 \n\t"\
        "paddd %%mm2, %%mm7 \n\t"\
        "paddd %%mm0, %%mm6 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm1 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm1 \n\t"\
        "packssdw %%mm6, %%mm7 \n\t"\
        "paddw %%mm0, %%mm1 \n\t"\
        "paddw %%mm0, %%mm7 \n\t"\
        "movq "U_TEMP"(%0), %%mm3 \n\t"\
        "movq "V_TEMP"(%0), %%mm4 \n\t"

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
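
/*
 * YSCALEYUV2RGBX converts the accumulated Y (mm1/mm7), U (mm3) and V (mm4)
 * words to RGB: subtract the offsets, scale by the colorspace coefficients,
 * add the chroma contributions per pixel, and pack to bytes, leaving
 * 8 pixels with B in mm2, G in mm4 and R in mm5.
 */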
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
    \
    MOVNTQ( q0,   (dst, index, 4))\
    MOVNTQ(  b,  8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
    \
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
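
/* Full vertical filtering to 32-bit RGB with accurate rounding; when an
 * alpha plane is present it is run through the same accurate Y loop. */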
static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "movq %%mm2, "U_TEMP"(%0) \n\t"
        "movq %%mm4, "V_TEMP"(%0) \n\t"
        "movq %%mm5, "Y_TEMP"(%0) \n\t"
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
        "movq "Y_TEMP"(%0), %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, "%5", %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}
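
/* Pack the byte-packed B/G/R registers down to RGB565: mask to 5-6-5 bits,
 * shift each channel into place and OR them together, 8 pixels per pass. */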
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    \
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    \
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
    \
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
    \
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
    \
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
    \
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)

static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}
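
/* RGB555 variant of the packer above: all three channels are masked to
 * 5 bits and green is shifted by 2 instead of 3. */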
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
    \
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    \
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
    \
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
    \
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
    \
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
    \
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)

static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}
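
/*
 * 24-bit writers: WRITEBGR24MMX assembles the pixels with shifts and ORs,
 * while WRITEBGR24MMXEXT uses pshufw together with the ff_M24A/B/C masks to
 * build the three 8-byte stores directly; the build selects one below.
 */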
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
    \
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
    \
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
    \
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
    \
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
    \
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
    \
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
    \
    "add $24, "#dst" \n\t"\
    \
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMXEXT(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
    \
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
    \
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
    \
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
    \
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
    \
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
    \
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
    \
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
    \
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
    \
    "add $24, "#dst" \n\t"\
    \
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if COMPILE_TEMPLATE_MMXEXT
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
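
/* The bgr24 X-filter functions below address the destination through REG_c
 * in addition to the registers used by the common macros, so they need a
 * build with six usable GPRs (HAVE_6REGS). */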
#if HAVE_6REGS
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, "%5", %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, "%5", %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}
#endif /* HAVE_6REGS */
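
/* WRITEYUY2 packs the Y words (mm1/mm7) and U/V words (mm3/mm4) to bytes and
 * interleaves them as Y0 U0 Y1 V0 ..., storing 8 YUYV pixels per pass. */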
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
    \
    MOVNTQ(%%mm1,  (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
    \
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)

static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                     const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrUSrc,
                                     const int16_t **chrVSrc,
                                     int chrFilterSize, const int16_t **alpSrc,
                                     uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    /* mm1=Y1, mm7=Y2, mm3=U, mm4=V */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
                                  const int16_t **lumSrc, int lumFilterSize,
                                  const int16_t *chrFilter, const int16_t **chrUSrc,
                                  const int16_t **chrVSrc,
                                  int chrFilterSize, const int16_t **alpSrc,
                                  uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    /* mm1=Y1, mm7=Y2, mm3=U, mm4=V */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}
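
/*
 * The *_2 ("bilinear") output path blends exactly two source lines per
 * output line: _UV interpolates chroma between uvbuf0/uvbuf1 using the
 * blend factor at CHR_MMX_FILTER_OFFSET+8, _YA does the same for luma with
 * the LUM_MMX_FILTER_OFFSET factor, and _COEFF finishes the YUV->RGB
 * conversion exactly as YSCALEYUV2RGBX above.
 */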
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)

/**
 * vertical bilinear scale YV12 to RGB
 */
static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
#if ARCH_X86_64
        __asm__ volatile(
            YSCALEYUV2RGB(%%r8, %5)
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
               "a" (&c->redDither),
               "r" (abuf0), "r" (abuf1)
            : "%r8"
        );
#else
        c->u_temp=(intptr_t)abuf0;
        c->v_temp=(intptr_t)abuf1;
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "push %0 \n\t"
            "push %1 \n\t"
            "mov "U_TEMP"(%5), %0 \n\t"
            "mov "V_TEMP"(%5), %1 \n\t"
            YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            "pop %1 \n\t"
            "pop %0 \n\t"
            WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
#endif
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
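
/* Note for this and the following *_2/*_1 functions: REG_b and REG_BP may be
 * the PIC register or the frame pointer on x86-32 and so cannot simply be
 * listed as clobbers; the code instead saves them into the context
 * (ESP_OFFSET) or on the stack, repurposes them inside the asm block, and
 * restores them before exiting. */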
static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    );
}

static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(bF8)
    );
}

static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(bF8,bFC)
    );
}
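
/* Bilinear blend for YUYV output: the luma/chroma blend factors are first
 * shifted right by 3 (presumably to keep the pmulhw products on the same
 * scale as the >>7 shifted samples), then Y and UV are interpolated just
 * like in the RGB path and handed to WRITEYUY2. */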
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf[2], uint8_t *dest,
                                  int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2PACKED(%%REGBP, %5)
        WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
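
/*
 * The *_1 ("unscaled") path reads a single luma line. YSCALEYUV2RGB1 uses
 * only uvbuf0 for chroma, while YSCALEYUV2RGB1b averages uvbuf0 and uvbuf1
 * for the case where the chroma phase lies halfway between two lines
 * (uvalpha >= 2048 in the callers below).
 */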
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index  ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index  ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)

/**
 * YV12 to RGB without scaling or interpolating
 */
static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    } else {
        const int16_t *ubuf1 = ubuf[1];
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    }
}

static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
        );
    }
}

static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8)
        );
    }
}

static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
        );
    }
}
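
/* Single-line variants of the YUYV packing: PACKED1 shifts one chroma line
 * into range, PACKED1b averages two chroma lines first. */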
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf0, uint8_t *dest,
                                  int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1(%%REGBP, %5)
            WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1b(%%REGBP, %5)
            WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
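
/*
 * Install the function pointers for the output routines defined above.
 * Packed output is only hooked up for 8-bit RGB/YUYV destinations without
 * SWS_BITEXACT; SWS_ACCURATE_RND selects the _ar variants, otherwise the
 * plain MMX vertical filter (yuv2yuvX) is also used for planar output.
 */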
static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
{
    enum AVPixelFormat dstFormat = c->dstFormat;

    c->use_mmx_vfilter = 0;
    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
        && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case AV_PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
#if HAVE_6REGS
                case AV_PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
#endif
                case AV_PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
                case AV_PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
                case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
                default: break;
                }
            }
        } else {
            c->use_mmx_vfilter = 1;
            c->yuv2planeX = RENAME(yuv2yuvX);
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case AV_PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
#if HAVE_6REGS
                case AV_PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
#endif
                case AV_PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
                case AV_PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
                case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
                default: break;
                }
            }
        }
        if (!(c->flags & SWS_FULL_CHR_H_INT)) {
            switch (c->dstFormat) {
            case AV_PIX_FMT_RGB32:
                c->yuv2packed1 = RENAME(yuv2rgb32_1);
                c->yuv2packed2 = RENAME(yuv2rgb32_2);
                break;
            case AV_PIX_FMT_BGR24:
                c->yuv2packed1 = RENAME(yuv2bgr24_1);
                c->yuv2packed2 = RENAME(yuv2bgr24_2);
                break;
            case AV_PIX_FMT_RGB555:
                c->yuv2packed1 = RENAME(yuv2rgb555_1);
                c->yuv2packed2 = RENAME(yuv2rgb555_2);
                break;
            case AV_PIX_FMT_RGB565:
                c->yuv2packed1 = RENAME(yuv2rgb565_1);
                c->yuv2packed2 = RENAME(yuv2rgb565_2);
                break;
            case AV_PIX_FMT_YUYV422:
                c->yuv2packed1 = RENAME(yuv2yuyv422_1);
                c->yuv2packed2 = RENAME(yuv2yuyv422_2);
                break;
            default:
                break;
            }
        }
    }

    if (c->srcBpc == 8 && c->dstBpc <= 14) {
        // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMXEXT
        if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
            c->hyscale_fast = ff_hyscale_fast_mmxext;
            c->hcscale_fast = ff_hcscale_fast_mmxext;
        } else {
#endif /* COMPILE_TEMPLATE_MMXEXT */
            c->hyscale_fast = NULL;
            c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMXEXT
        }
#endif /* COMPILE_TEMPLATE_MMXEXT */
    }
}