/*
 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef MOVNTQ2
#undef PREFETCH

#if COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
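
/*
 * On MMXEXT, MOVNTQ expands to a non-temporal store (movntq) that bypasses
 * the cache on its way to memory and PREFETCH pulls source lines in ahead
 * of use (prefetchnta); on plain MMX both degrade to an ordinary movq and
 * a nop. The REAL_MOVNTQ indirection makes sure macro arguments are
 * expanded before they are stringified.
 */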
#if !COMPILE_TEMPLATE_MMXEXT
static av_always_inline void
dither_8to16(const uint8_t *srcDither, int rot)
{
    if (rot) {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "psrlq $24, %%mm3\n\t"
                         "psllq $40, %%mm4\n\t"
                         "por %%mm4, %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         :: "r"(srcDither)
                         );
    } else {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         :: "r"(srcDither)
                         );
    }
}
#endif
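
/*
 * Rough scalar sketch of dither_8to16() above (for reference only, not
 * compiled): the 8 dither bytes are zero-extended to words in mm3/mm4;
 * with rot != 0 the pattern is first rotated by 3 bytes so alternating
 * output phases use a shifted dither:
 *
 *     uint16_t d[8];
 *     for (int i = 0; i < 8; i++)
 *         d[i] = srcDither[rot ? (i + 3) & 7 : i];
 *     // mm3 = d[0..3], mm4 = d[4..7]
 */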
static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
                             const int16_t **src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
{
    dither_8to16(dither, offset);
    filterSize--;
    __asm__ volatile(
        "movd %0, %%mm1\n\t"
        "punpcklwd %%mm1, %%mm1\n\t"
        "punpckldq %%mm1, %%mm1\n\t"
        "psllw $3, %%mm1\n\t"
        "paddw %%mm1, %%mm3\n\t"
        "paddw %%mm1, %%mm4\n\t"
        "psraw $4, %%mm3\n\t"
        "psraw $4, %%mm4\n\t"
        ::"m"(filterSize)
        );
    __asm__ volatile(
        "movq %%mm3, %%mm6\n\t"
        "movq %%mm4, %%mm7\n\t"
        "movl %3, %%ecx\n\t"
        "mov %0, %%"REG_d" \n\t"
        "mov (%%"REG_d"), %%"REG_S" \n\t"
        ".p2align 4 \n\t" /* FIXME Unroll? */
        "1: \n\t"
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */
        "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */
        "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */
        "add $16, %%"REG_d" \n\t"
        "mov (%%"REG_d"), %%"REG_S" \n\t"
        "test %%"REG_S", %%"REG_S" \n\t"
        "pmulhw %%mm0, %%mm2 \n\t"
        "pmulhw %%mm0, %%mm5 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "paddw %%mm5, %%mm4 \n\t"
        " jnz 1b \n\t"
        "psraw $3, %%mm3 \n\t"
        "psraw $3, %%mm4 \n\t"
        "packuswb %%mm4, %%mm3 \n\t"
        MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"
        "add $8, %%"REG_c" \n\t"
        "cmp %2, %%"REG_c" \n\t"
        "movq %%mm6, %%mm3\n\t"
        "movq %%mm7, %%mm4\n\t"
        "mov %0, %%"REG_d" \n\t"
        "mov (%%"REG_d"), %%"REG_S" \n\t"
        "jb 1b \n\t"
        :: "g" (filter),
           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
        : "%"REG_d, "%"REG_S, "%"REG_c
        );
}
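
/*
 * A loose scalar sketch of the loop above (shift amounts as in the asm;
 * filterSize here is already the decremented value): each output byte is
 * a dither-biased sum of pmulhw products over the filter taps,
 *
 *     sum = (dither[i & 7] + (filterSize << 3)) >> 4;
 *     for (j = 0; j <= filterSize; j++)
 *         sum += (filter[j] * src[j][i]) >> 16;   // pmulhw
 *     dest[i] = av_clip_uint8(sum >> 3);          // psraw $3 + packuswb
 */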
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
        "lea "offset"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
        "movq "#dst1", "#dst2" \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
        "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw "#coeff", "#src1" \n\t"\
        "pmulhw "#coeff", "#src2" \n\t"\
        "paddw "#src1", "#dst1" \n\t"\
        "paddw "#src2", "#dst2" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7)

#define YSCALEYUV2PACKEDX_END \
        :: "r" (&c->redDither), \
           "m" (dummy), "m" (dummy), "m" (dummy),\
           "r" (dest), "m" (dstW_reg), "m"(uv_off) \
        : "%"REG_a, "%"REG_d, "%"REG_S \
        );
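
/*
 * The YSCALEYUV2PACKEDX* family implements the general N-tap vertical
 * filter for packed output: _UV walks the chroma filter list (a
 * pointer/coefficient pair per tap, terminated by a NULL source pointer,
 * hence the test/jnz), _YA does the same for one luma or alpha plane pair,
 * and _END supplies the shared operand list: %0 = &c->redDither (the base
 * address all *_OFFSET context fields are relative to), %4 = dest,
 * %5 = dstW, %6 = the chroma U-to-V plane offset.
 */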
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "add %6, %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "movq %%mm4, "U_TEMP"(%0) \n\t"\
        "movq %%mm6, "V_TEMP"(%0) \n\t"

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
        "lea "offset"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm1, %%mm1 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
        "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm4, %%mm0 \n\t"\
        "punpckhwd %%mm4, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
        "pmaddwd %%mm4, %%mm0 \n\t"\
        "pmaddwd %%mm4, %%mm3 \n\t"\
        "paddd %%mm0, %%mm1 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm4, %%mm2 \n\t"\
        "pmaddwd %%mm4, %%mm0 \n\t"\
        "paddd %%mm2, %%mm7 \n\t"\
        "paddd %%mm0, %%mm6 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm1 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm1 \n\t"\
        "packssdw %%mm6, %%mm7 \n\t"\
        "paddw %%mm0, %%mm1 \n\t"\
        "paddw %%mm0, %%mm7 \n\t"\
        "movq "U_TEMP"(%0), %%mm3 \n\t"\
        "movq "V_TEMP"(%0), %%mm4 \n\t"

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
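
/*
 * The _ACCURATE variants trade speed for precision: instead of summing
 * 16-bit pmulhw products they interleave two taps with punpck{l,h}wd and
 * accumulate full 32-bit pmaddwd products, only narrowing back to 16 bits
 * (psrad + packssdw) and adding the rounder once the whole filter has been
 * applied.
 */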
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
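
/*
 * WRITEBGR32 interleaves four byte-packed component registers (b, g, r, a)
 * into eight 32-bit pixels: punpckl/hbw builds GB and AR byte pairs,
 * punpckl/hwd merges them into ARGB dwords, and four MOVNTQ stores write
 * 32 bytes before the index is advanced and the loop jumps back to label 1.
 */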
static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "movq %%mm2, "U_TEMP"(%0) \n\t"
        "movq %%mm4, "V_TEMP"(%0) \n\t"
        "movq %%mm5, "Y_TEMP"(%0) \n\t"
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
        "movq "Y_TEMP"(%0), %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
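
/*
 * WRITERGB16 packs to RGB565: each component is masked down to its
 * significant bits (0xF8 / 0xFC / 0xF8), blue is shifted into bits 0-4,
 * and the unpack/shift/or sequence fuses B, G and R of two 4-pixel groups
 * into 16-bit words for the two stores.
 */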
static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
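
/*
 * WRITERGB15 is the same scheme for RGB555: all three components are
 * masked with 0xF8, red is additionally shifted right once so the top bit
 * of each output word stays clear, and green is realigned with psllq $2
 * instead of $3.
 */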
static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR24MMXEXT(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if COMPILE_TEMPLATE_MMXEXT
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
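
/*
 * Two 24-bit writers: the plain MMX version builds 0RGB dwords and then
 * shifts/ORs them into three contiguous qwords, while the MMXEXT version
 * uses pshufw together with the ff_M24A/ff_M24B/ff_M24C masks to scatter
 * B, G and R bytes straight into their packed positions. WRITEBGR24
 * selects whichever variant matches the current template.
 */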
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
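
/*
 * WRITEYUY2 packs Y (mm1/mm7 after the pack), U (mm3) and V (mm4) into
 * YUYV order: U and V are narrowed to bytes and interleaved into a
 * UVUV... register, which is then interleaved with the luma bytes to
 * produce Y0 U0 Y1 V0 Y2 U1 ... across two 8-byte stores.
 */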
static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                     const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrUSrc,
                                     const int16_t **chrVSrc,
                                     int chrFilterSize, const int16_t **alpSrc,
                                     uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    /* mm1=Y1, mm3=U, mm4=V, mm7=Y2 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
                                  const int16_t **lumSrc, int lumFilterSize,
                                  const int16_t *chrFilter, const int16_t **chrUSrc,
                                  const int16_t **chrVSrc,
                                  int chrFilterSize, const int16_t **alpSrc,
                                  uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    /* mm1=Y1, mm3=U, mm4=V, mm7=Y2 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]*uvalpha1 + uvbuf1[eax]*(1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]*uvalpha1 + uvbuf1[eax+2048]*(1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /* buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /* buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /* buf0[eax+4]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /* buf1[eax+4]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax+4] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]*yalpha1 + buf1[eax]*(1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax+4]*yalpha1 + buf1[eax+4]*(1-yalpha1) >>16*/

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
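
/*
 * The two-row (_2) output paths blend a pair of adjacent source lines
 * before conversion. Per component the asm computes, in effect (scalar
 * sketch of _UV/_YA above):
 *
 *     out = (buf1[i] >> 4) + (((buf0[i] - buf1[i]) * alpha) >> 16);
 *
 * i.e. a linear interpolation between the two rows with the 4-bit headroom
 * shift folded in, matching the buf0*alpha + buf1*(1-alpha) comments.
 */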
/**
 * vertical bilinear scale YV12 to RGB
 */
static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
#if ARCH_X86_64
        __asm__ volatile(
            YSCALEYUV2RGB(%%r8, %5)
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
               "a" (&c->redDither),
               "r" (abuf0), "r" (abuf1)
            : "%r8"
        );
#else
        c->u_temp = (intptr_t)abuf0;
        c->v_temp = (intptr_t)abuf1;
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "push %0 \n\t"
            "push %1 \n\t"
            "mov "U_TEMP"(%5), %0 \n\t"
            "mov "V_TEMP"(%5), %1 \n\t"
            YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            "pop %1 \n\t"
            "pop %0 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
#endif
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB15(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB16(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]*uvalpha1 + uvbuf1[eax]*(1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]*uvalpha1 + uvbuf1[eax+2048]*(1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /* buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /* buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /* buf0[eax+4]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /* buf1[eax+4]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax+4] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]*yalpha1 + buf1[eax]*(1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax+4]*yalpha1 + buf1[eax+4]*(1-yalpha1) >>16*/

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
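
/*
 * YSCALEYUV2PACKED is the blend-only front end for the YUY2 path: it first
 * pre-shifts the stored luma and chroma blend coefficients right by 3 (so
 * the psraw $7 used here, instead of $4, still lands in 8-bit range), then
 * performs the same two-row interpolation as YSCALEYUV2RGB_UV/_YA, leaving
 * Y in mm1/mm7 and U/V in mm3/mm4 for WRITEYUY2.
 */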
static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf[2], uint8_t *dest,
                                  int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2PACKED(%%REGBP, %5)
        WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0[eax+4]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0[eax+4]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"

#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
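
/*
 * The single-row (_1) variants skip the blend entirely: luma and chroma
 * are just shifted down into range (psraw $4 for the RGB paths, $7 for the
 * packed path). The "1b" flavours instead average the two chroma lines
 * (paddw + psrlw) for the uvalpha >= 2048 case, and _ALPHA loads one alpha
 * row the same way.
 */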
/**
 * YV12 to RGB without scaling or interpolating
 */
static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    } else {
        const int16_t *ubuf1 = ubuf[1];
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    }
}
static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "psraw $7, %%mm3 \n\t"\
    "psraw $7, %%mm4 \n\t"\
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0[eax+4]*/\
    "psraw $7, %%mm1 \n\t"\
    "psraw $7, %%mm7 \n\t"

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t"\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t"\
    "psrlw $8, %%mm4 \n\t"\
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0[eax+4]*/\
    "psraw $7, %%mm1 \n\t"\
    "psraw $7, %%mm7 \n\t"

#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf0, uint8_t *dest,
                                  int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1b(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
  1382. #if COMPILE_TEMPLATE_MMXEXT
static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                 int dstWidth, const uint8_t *src,
                                 int srcW, int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    void    *mmxextFilterCode = c->lumMmxextFilterCode;
    int i;
#if defined(PIC)
    uint64_t ebxsave;
#endif
#if ARCH_X86_64
    uint64_t retsave;
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %5        \n\t"
#if ARCH_X86_64
        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %6        \n\t"
#endif
#else
#if ARCH_X86_64
        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %5        \n\t"
#endif
#endif
        "pxor %%mm7, %%mm7        \n\t"
        "mov %0, %%"REG_c"        \n\t"
        "mov %1, %%"REG_D"        \n\t"
        "mov %2, %%"REG_d"        \n\t"
        "mov %3, %%"REG_b"        \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c")     \n\t"
        PREFETCH" 32(%%"REG_c")   \n\t"
        PREFETCH" 64(%%"REG_c")   \n\t"
#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
        "movl (%%"REG_b"), %%esi            \n\t"\
        "call *%4                           \n\t"\
        "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
        "add %%"REG_S", %%"REG_c"           \n\t"\
        "add %%"REG_a", %%"REG_D"           \n\t"\
        "xor %%"REG_a", %%"REG_a"           \n\t"
#else
#define CALL_MMXEXT_FILTER_CODE \
        "movl (%%"REG_b"), %%esi                \n\t"\
        "call *%4                               \n\t"\
        "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
        "add %%"REG_a", %%"REG_D"               \n\t"\
        "xor %%"REG_a", %%"REG_a"               \n\t"
#endif /* ARCH_X86_64 */
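
        /* Each invocation loads the next filter position into esi and jumps
         * into the generated routine, which fills a fixed slice of the
         * output line; the source (REG_c) and destination (REG_D) cursors
         * are then advanced. Eight slices cover the luma line here. */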
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
#if defined(PIC)
        "mov %5, %%"REG_b"        \n\t"
#if ARCH_X86_64
        "mov %6, %%"REG_a"        \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"
#endif
#else
#if ARCH_X86_64
        "mov %5, %%"REG_a"        \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"
#endif
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
#if ARCH_X86_64
          ,"m" (retsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );
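
    /* Output positions that map at or past the last source pixel replicate
     * it, promoted to the 15-bit intermediate scale (*128 == <<7). */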
    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
        dst[i] = src[srcW - 1] * 128;
}
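
/* hcscale_fast: chroma counterpart of hyscale_fast. It runs the generated
 * chroma filter over both planes in turn (src1 -> dst1, then src2 -> dst2),
 * four slices per plane. */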
static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
                                 int dstWidth, const uint8_t *src1,
                                 const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    void    *mmxextFilterCode = c->chrMmxextFilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#if ARCH_X86_64
    DECLARE_ALIGNED(8, uint64_t, retsave);
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %7        \n\t"
#if ARCH_X86_64
        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %8        \n\t"
#endif
#else
#if ARCH_X86_64
        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %7        \n\t"
#endif
#endif
        "pxor %%mm7, %%mm7        \n\t"
        "mov %0, %%"REG_c"        \n\t"
        "mov %1, %%"REG_D"        \n\t"
        "mov %2, %%"REG_d"        \n\t"
        "mov %3, %%"REG_b"        \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c")     \n\t"
        PREFETCH" 32(%%"REG_c")   \n\t"
        PREFETCH" 64(%%"REG_c")   \n\t"
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "mov %5, %%"REG_c"        \n\t" // src2
        "mov %6, %%"REG_D"        \n\t" // dst2
        PREFETCH" (%%"REG_c")     \n\t"
        PREFETCH" 32(%%"REG_c")   \n\t"
        PREFETCH" 64(%%"REG_c")   \n\t"
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
#if defined(PIC)
        "mov %7, %%"REG_b"        \n\t"
#if ARCH_X86_64
        "mov %8, %%"REG_a"        \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"
#endif
#else
#if ARCH_X86_64
        "mov %7, %%"REG_a"        \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"
#endif
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m" (dst2)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
#if ARCH_X86_64
          ,"m" (retsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
        dst1[i] = src1[srcW - 1] * 128;
        dst2[i] = src2[srcW - 1] * 128;
    }
}

#endif /* COMPILE_TEMPLATE_MMXEXT */
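
/* sws_init_swscale: select this template's optimized function pointers.
 * Accurate-rounding (_ar) variants are used with SWS_ACCURATE_RND; otherwise
 * the MMX vertical filter is enabled and the plain variants are installed.
 * None of the packed paths apply to >8-bit, NV12/NV21, or bitexact output. */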
static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
{
    enum AVPixelFormat dstFormat = c->dstFormat;

    c->use_mmx_vfilter = 0;
    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
        && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case AV_PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
                case AV_PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
                case AV_PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
                case AV_PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
                case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
                default: break;
                }
            }
        } else {
            c->use_mmx_vfilter = 1;
            c->yuv2planeX = RENAME(yuv2yuvX);
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case AV_PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
                case AV_PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
                case AV_PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
                case AV_PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
                case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
                default: break;
                }
            }
        }
        if (!(c->flags & SWS_FULL_CHR_H_INT)) {
            switch (c->dstFormat) {
            case AV_PIX_FMT_RGB32:
                c->yuv2packed1 = RENAME(yuv2rgb32_1);
                c->yuv2packed2 = RENAME(yuv2rgb32_2);
                break;
            case AV_PIX_FMT_BGR24:
                c->yuv2packed1 = RENAME(yuv2bgr24_1);
                c->yuv2packed2 = RENAME(yuv2bgr24_2);
                break;
            case AV_PIX_FMT_RGB555:
                c->yuv2packed1 = RENAME(yuv2rgb555_1);
                c->yuv2packed2 = RENAME(yuv2rgb555_2);
                break;
            case AV_PIX_FMT_RGB565:
                c->yuv2packed1 = RENAME(yuv2rgb565_1);
                c->yuv2packed2 = RENAME(yuv2rgb565_2);
                break;
            case AV_PIX_FMT_YUYV422:
                c->yuv2packed1 = RENAME(yuv2yuyv422_1);
                c->yuv2packed2 = RENAME(yuv2yuyv422_2);
                break;
            default:
                break;
            }
        }
    }
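
    /* When fast-bilinear is unavailable the pointers are cleared so the
     * caller falls back to the generic filter-based horizontal scaler. */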
    if (c->srcBpc == 8 && c->dstBpc <= 14) {
        // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMXEXT
        if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
            c->hyscale_fast = RENAME(hyscale_fast);
            c->hcscale_fast = RENAME(hcscale_fast);
        } else {
#endif /* COMPILE_TEMPLATE_MMXEXT */
            c->hyscale_fast = NULL;
            c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMXEXT
        }
#endif /* COMPILE_TEMPLATE_MMXEXT */
    }
}