/*
 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef MOVNTQ2
#undef PREFETCH

#if COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)

#if !COMPILE_TEMPLATE_MMXEXT
static av_always_inline void
dither_8to16(const uint8_t *srcDither, int rot)
{
    if (rot) {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "psrlq $24, %%mm3\n\t"
                         "psllq $40, %%mm4\n\t"
                         "por %%mm4, %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         :: "r"(srcDither)
                         );
    } else {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         :: "r"(srcDither)
                         );
    }
}
#endif
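
/*
 * RENAME(yuv2yuvX): vertical FIR filter for one planar output row.
 * dither_8to16() expands the 8 dither bytes to words in mm3/mm4, which seed
 * the accumulator; the inner loop then adds pmulhw(srcData, filterCoeff)
 * over all taps for 8 pixels at a time, and the packed result is stored
 * with MOVNTQ2 (a non-temporal movntq on MMXEXT builds, a plain movq
 * otherwise).
 */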
static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
                             const int16_t **src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
{
    dither_8to16(dither, offset);
    filterSize--;
    __asm__ volatile(
        "movd %0, %%mm1\n\t"
        "punpcklwd %%mm1, %%mm1\n\t"
        "punpckldq %%mm1, %%mm1\n\t"
        "psllw $3, %%mm1\n\t"
        "paddw %%mm1, %%mm3\n\t"
        "paddw %%mm1, %%mm4\n\t"
        "psraw $4, %%mm3\n\t"
        "psraw $4, %%mm4\n\t"
        ::"m"(filterSize)
        );
    __asm__ volatile(\
        "movq %%mm3, %%mm6\n\t"
        "movq %%mm4, %%mm7\n\t"
        "movl %3, %%ecx\n\t"
        "mov %0, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ".p2align 4 \n\t" /* FIXME Unroll? */\
        "1: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\
        "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        " jnz 1b \n\t"\
        "psraw $3, %%mm3 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm3 \n\t"
        MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"
        "add $8, %%"REG_c" \n\t"\
        "cmp %2, %%"REG_c" \n\t"\
        "movq %%mm6, %%mm3\n\t"
        "movq %%mm7, %%mm4\n\t"
        "mov %0, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "g" (filter),
           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
        : "%"REG_d, "%"REG_S, "%"REG_c
        );
}
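
/*
 * The YSCALEYUV2PACKEDX* macro family implements the vertical filter stage
 * for packed output: _UV loops over the chroma filter taps (the V plane
 * sits uv_off bytes after the U plane, hence the "add %6"), _YA does the
 * same for two groups of four luma (or alpha) samples, and the _ACCURATE
 * variants accumulate with pmaddwd in 32-bit precision before rounding.
 * Each macro expands to one fragment of a single __asm__ statement that is
 * closed by YSCALEYUV2PACKEDX_END.
 */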
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
        "lea "offset"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
        "movq "#dst1", "#dst2" \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
        "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw "#coeff", "#src1" \n\t"\
        "pmulhw "#coeff", "#src2" \n\t"\
        "paddw "#src1", "#dst1" \n\t"\
        "paddw "#src2", "#dst2" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
        :: "r" (&c->redDither), \
           "m" (dummy), "m" (dummy), "m" (dummy),\
           "r" (dest), "m" (dstW_reg), "m"(uv_off) \
           NAMED_CONSTRAINTS_ADD(bF8,bFC) \
        : "%"REG_a, "%"REG_d, "%"REG_S \
        );

#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "movq %%mm4, "U_TEMP"(%0) \n\t"\
        "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
        "lea "offset"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm1, %%mm1 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
        "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm4, %%mm0 \n\t"\
        "punpckhwd %%mm4, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
        "pmaddwd %%mm4, %%mm0 \n\t"\
        "pmaddwd %%mm4, %%mm3 \n\t"\
        "paddd %%mm0, %%mm1 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm4, %%mm2 \n\t"\
        "pmaddwd %%mm4, %%mm0 \n\t"\
        "paddd %%mm2, %%mm7 \n\t"\
        "paddd %%mm0, %%mm6 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm1 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm1 \n\t"\
        "packssdw %%mm6, %%mm7 \n\t"\
        "paddw %%mm0, %%mm1 \n\t"\
        "paddw %%mm0, %%mm7 \n\t"\
        "movq "U_TEMP"(%0), %%mm3 \n\t"\
        "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
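
/*
 * YSCALEYUV2RGBX converts the filtered Y (mm1/mm7) and U/V (mm3/mm4) words
 * to RGB with the per-context coefficients: it forms the ub, ug, vg, vr and
 * Y products with pmulhw, sums them per channel, and packs the results as
 * bytes into mm2 (B), mm4 (G) and mm5 (R) for the WRITE* macros below.
 */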
#define YSCALEYUV2RGBX \
        "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
        "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
        "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
        "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
        "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
        "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
        "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
        "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
        "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
        "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
        "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
        "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
        "paddw %%mm3, %%mm4 \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "movq %%mm5, %%mm6 \n\t"\
        "movq %%mm4, %%mm3 \n\t"\
        "punpcklwd %%mm2, %%mm2 \n\t"\
        "punpcklwd %%mm5, %%mm5 \n\t"\
        "punpcklwd %%mm4, %%mm4 \n\t"\
        "paddw %%mm1, %%mm2 \n\t"\
        "paddw %%mm1, %%mm5 \n\t"\
        "paddw %%mm1, %%mm4 \n\t"\
        "punpckhwd %%mm0, %%mm0 \n\t"\
        "punpckhwd %%mm6, %%mm6 \n\t"\
        "punpckhwd %%mm3, %%mm3 \n\t"\
        "paddw %%mm7, %%mm0 \n\t"\
        "paddw %%mm7, %%mm6 \n\t"\
        "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
        "packuswb %%mm0, %%mm2 \n\t"\
        "packuswb %%mm6, %%mm5 \n\t"\
        "packuswb %%mm3, %%mm4 \n\t"\

#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
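
/*
 * The *_X_ar variants below use the _ACCURATE (pmaddwd) accumulation and
 * are presumably selected when accurate rounding is requested (cf.
 * SWS_ACCURATE_RND); the plain *_X variants take the faster pmulhw path.
 * Both emit 8 ARGB pixels per loop iteration.
 */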
static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "movq %%mm2, "U_TEMP"(%0) \n\t"
        "movq %%mm4, "V_TEMP"(%0) \n\t"
        "movq %%mm5, "Y_TEMP"(%0) \n\t"
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
        "movq "Y_TEMP"(%0), %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}
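
/*
 * RGB565 packing: mask B and R down to 5 bits (bF8) and G to 6 bits (bFC),
 * then shift and interleave the byte lanes so each pixel ends up as one
 * 16-bit 5-6-5 word; two MOVNTQs store 8 pixels.
 */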
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)

static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
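
/*
 * RGB555 packing: same layout as the 565 writer above, but all three
 * channels are masked to 5 bits (bF8) and the shifts differ by one bit,
 * yielding X-5-5-5 words.
 */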
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)

static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
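
/*
 * 24-bit output cannot be stored as one whole quadword per pixel group, so
 * the BGR24 writers rebuild three 8-byte stores from four 0RGB dwords: the
 * plain MMX version does it with shifts and ORs, while the MMXEXT version
 * uses pshufw together with the ff_M24A/B/C masks.
 */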
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMXEXT(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#if COMPILE_TEMPLATE_MMXEXT
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif

static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)

static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                     const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrUSrc,
                                     const int16_t **chrVSrc,
                                     int chrFilterSize, const int16_t **alpSrc,
                                     uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
                                  const int16_t **lumSrc, int lumFilterSize,
                                  const int16_t *chrFilter, const int16_t **chrUSrc,
                                  const int16_t **chrVSrc,
                                  int chrFilterSize, const int16_t **alpSrc,
                                  uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
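
/*
 * Two-row (bilinear) variants: REAL_YSCALEYUV2RGB_UV interpolates U/V
 * between uvbuf0 and uvbuf1 with the uvalpha weight stored at
 * CHR_MMX_FILTER_OFFSET+8, _YA does the same for luma with the yalpha
 * weight, and _COEFF applies the same YUV->RGB matrix as YSCALEYUV2RGBX.
 */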
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
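
/*
 * Naming convention for the output functions: *_X uses the full N-tap
 * vertical filter, *_2 blends two source rows (bilinear, see above) and
 * *_1 reads a single row without vertical interpolation.
 */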
/**
 * vertical bilinear scale YV12 to RGB
 */
static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
#if ARCH_X86_64
        __asm__ volatile(
            YSCALEYUV2RGB(%%r8, %5)
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
               "a" (&c->redDither),
               "r" (abuf0), "r" (abuf1)
            : "%r8"
        );
#else
        c->u_temp=(intptr_t)abuf0;
        c->v_temp=(intptr_t)abuf1;
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "push %0 \n\t"
            "push %1 \n\t"
            "mov "U_TEMP"(%5), %0 \n\t"
            "mov "V_TEMP"(%5), %1 \n\t"
            YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            "pop %1 \n\t"
            "pop %0 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
#endif
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}

static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    );
}

static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB15(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(bF8)
    );
}

static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB16(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(bF8,bFC)
    );
}
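
/*
 * YUY2 output needs no colorspace conversion: REAL_YSCALEYUV2PACKED only
 * blends the two source rows and shifts Y and U/V down to 8-bit range (the
 * blend weights are pre-shifted by 3 so the psraw $7 on the data keeps the
 * overall scale consistent), leaving the byte packing to WRITEYUY2.
 */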
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf[2], uint8_t *dest,
                                  int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2PACKED(%%REGBP, %5)
        WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
/**
 * YV12 to RGB without scaling or interpolating
 */
static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    } else {
        const int16_t *ubuf1 = ubuf[1];
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    }
}

static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
        );
    }
}

static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8)
        );
    }
}

static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
        );
    }
}

#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf0, uint8_t *dest,
                                  int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1b(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
  1394. #if COMPILE_TEMPLATE_MMXEXT
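/* Fast bilinear horizontal scalers. Instead of looping over the filter in C,
 * these repeatedly call the scaling code that was generated at runtime into
 * c->lumMmxextFilterCode / c->chrMmxextFilterCode. %ebx/%rbx is spilled by
 * hand because it can be the PIC register, and on x86-64 the qword at
 * -8(%rsp) is additionally saved to retsave and restored afterwards
 * (the generated code apparently uses that stack slot). */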
static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                 int dstWidth, const uint8_t *src,
                                 int srcW, int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    void    *mmxextFilterCode = c->lumMmxextFilterCode;
    int i;
#if defined(PIC)
    uint64_t ebxsave;
#endif
#if ARCH_X86_64
    uint64_t retsave;
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov      %%"REG_b", %5             \n\t"
#if ARCH_X86_64
        "mov      -8(%%rsp), %%"REG_a"      \n\t"
        "mov      %%"REG_a", %6             \n\t"
#endif
#else
#if ARCH_X86_64
        "mov      -8(%%rsp), %%"REG_a"      \n\t"
        "mov      %%"REG_a", %5             \n\t"
#endif
#endif
        "pxor         %%mm7, %%mm7          \n\t"
        "mov             %0, %%"REG_c"      \n\t"
        "mov             %1, %%"REG_D"      \n\t"
        "mov             %2, %%"REG_d"      \n\t"
        "mov             %3, %%"REG_b"      \n\t"
        "xor      %%"REG_a", %%"REG_a"      \n\t" // i
        PREFETCH"   (%%"REG_c")             \n\t"
        PREFETCH" 32(%%"REG_c")             \n\t"
        PREFETCH" 64(%%"REG_c")             \n\t"
#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
        "movl            (%%"REG_b"), %%esi     \n\t"\
        "call                    *%4            \n\t"\
        "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
        "add               %%"REG_S", %%"REG_c" \n\t"\
        "add               %%"REG_a", %%"REG_D" \n\t"\
        "xor               %%"REG_a", %%"REG_a" \n\t"
#else
#define CALL_MMXEXT_FILTER_CODE \
        "movl            (%%"REG_b"), %%esi     \n\t"\
        "call                    *%4            \n\t"\
        "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
        "add               %%"REG_a", %%"REG_D" \n\t"\
        "xor               %%"REG_a", %%"REG_a" \n\t"
#endif /* ARCH_X86_64 */
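/* One CALL_MMXEXT_FILTER_CODE block loads the next filter position into
 * %esi, runs the generated scaler via an indirect call, then advances the
 * source pointer by the filter position reached (read back through the
 * index left in %REG_a), advances the destination pointer by that same
 * index, and clears the index for the next block. The luma scaler unrolls
 * this eight times per line, the chroma scaler below four times per plane. */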
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
#if defined(PIC)
        "mov             %5, %%"REG_b"      \n\t"
#if ARCH_X86_64
        "mov             %6, %%"REG_a"      \n\t"
        "mov      %%"REG_a", -8(%%rsp)      \n\t"
#endif
#else
#if ARCH_X86_64
        "mov             %5, %%"REG_a"      \n\t"
        "mov      %%"REG_a", -8(%%rsp)      \n\t"
#endif
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
#if ARCH_X86_64
          ,"m"(retsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );
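
    /* Destination samples whose source position (i*xInc)>>16 falls on the
     * last input pixel or beyond would need src[srcW] to interpolate;
     * overwrite them with the replicated edge pixel, multiplied by 128 to
     * match the 7 fractional bits of the 16-bit intermediate format. */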
    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
        dst[i] = src[srcW - 1] * 128;
}
static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
                                 int dstWidth, const uint8_t *src1,
                                 const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    void    *mmxextFilterCode = c->chrMmxextFilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#if ARCH_X86_64
    DECLARE_ALIGNED(8, uint64_t, retsave);
#endif

    __asm__ volatile(
#if defined(PIC)
        "mov      %%"REG_b", %7             \n\t"
#if ARCH_X86_64
        "mov      -8(%%rsp), %%"REG_a"      \n\t"
        "mov      %%"REG_a", %8             \n\t"
#endif
#else
#if ARCH_X86_64
        "mov      -8(%%rsp), %%"REG_a"      \n\t"
        "mov      %%"REG_a", %7             \n\t"
#endif
#endif
        "pxor         %%mm7, %%mm7          \n\t"
        "mov             %0, %%"REG_c"      \n\t"
        "mov             %1, %%"REG_D"      \n\t"
        "mov             %2, %%"REG_d"      \n\t"
        "mov             %3, %%"REG_b"      \n\t"
        "xor      %%"REG_a", %%"REG_a"      \n\t" // i
        PREFETCH"   (%%"REG_c")             \n\t"
        PREFETCH" 32(%%"REG_c")             \n\t"
        PREFETCH" 64(%%"REG_c")             \n\t"
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        "xor      %%"REG_a", %%"REG_a"      \n\t" // i
        "mov             %5, %%"REG_c"      \n\t" // src2
        "mov             %6, %%"REG_D"      \n\t" // dst2
        PREFETCH"   (%%"REG_c")             \n\t"
        PREFETCH" 32(%%"REG_c")             \n\t"
        PREFETCH" 64(%%"REG_c")             \n\t"
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
#if defined(PIC)
        "mov             %7, %%"REG_b"      \n\t"
#if ARCH_X86_64
        "mov             %8, %%"REG_a"      \n\t"
        "mov      %%"REG_a", -8(%%rsp)      \n\t"
#endif
#else
#if ARCH_X86_64
        "mov             %7, %%"REG_a"      \n\t"
        "mov      %%"REG_a", -8(%%rsp)      \n\t"
#endif
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
#if ARCH_X86_64
          ,"m"(retsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );
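
    /* Same edge replication as in hyscale_fast(), applied to both chroma planes. */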
    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
        dst1[i] = src1[srcW - 1] * 128;
        dst2[i] = src2[srcW - 1] * 128;
    }
}
#endif /* COMPILE_TEMPLATE_MMXEXT */
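
/* Hook the MMX/MMXEXT implementations from this template into the SwsContext
 * function pointers, but only where they are usable: no 9/10/16 bit per
 * component output, no NV12/NV21, and not in bitexact mode. The _ar variants
 * are the accurate-rounding versions selected by SWS_ACCURATE_RND. */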
static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
{
    enum AVPixelFormat dstFormat = c->dstFormat;

    c->use_mmx_vfilter = 0;
    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
        && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case AV_PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
                case AV_PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
                case AV_PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
                case AV_PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
                case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
                default: break;
                }
            }
        } else {
            c->use_mmx_vfilter = 1;
            c->yuv2planeX = RENAME(yuv2yuvX);
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (c->dstFormat) {
                case AV_PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
                case AV_PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
                case AV_PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
                case AV_PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
                case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
                default: break;
                }
            }
        }
        if (!(c->flags & SWS_FULL_CHR_H_INT)) {
            switch (c->dstFormat) {
            case AV_PIX_FMT_RGB32:
                c->yuv2packed1 = RENAME(yuv2rgb32_1);
                c->yuv2packed2 = RENAME(yuv2rgb32_2);
                break;
            case AV_PIX_FMT_BGR24:
                c->yuv2packed1 = RENAME(yuv2bgr24_1);
                c->yuv2packed2 = RENAME(yuv2bgr24_2);
                break;
            case AV_PIX_FMT_RGB555:
                c->yuv2packed1 = RENAME(yuv2rgb555_1);
                c->yuv2packed2 = RENAME(yuv2rgb555_2);
                break;
            case AV_PIX_FMT_RGB565:
                c->yuv2packed1 = RENAME(yuv2rgb565_1);
                c->yuv2packed2 = RENAME(yuv2rgb565_2);
                break;
            case AV_PIX_FMT_YUYV422:
                c->yuv2packed1 = RENAME(yuv2yuyv422_1);
                c->yuv2packed2 = RENAME(yuv2yuyv422_2);
                break;
            default:
                break;
            }
        }
    }

    if (c->srcBpc == 8 && c->dstBpc <= 14) {
        // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMXEXT
        if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
            c->hyscale_fast = RENAME(hyscale_fast);
            c->hcscale_fast = RENAME(hcscale_fast);
        } else {
#endif /* COMPILE_TEMPLATE_MMXEXT */
            c->hyscale_fast = NULL;
            c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMXEXT
        }
#endif /* COMPILE_TEMPLATE_MMXEXT */
    }
}