You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

2007 lines
88KB

  1. /*
  2. * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  3. *
  4. * This file is part of Libav.
  5. *
  6. * Libav is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * Libav is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with Libav; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  /*
   * Per-template instruction selection.
   *
   * When COMPILE_TEMPLATE_MMX2 is non-zero, PREFETCH expands to the
   * "prefetchnta" mnemonic and MOVNTQ to a "movntq" store; otherwise
   * PREFETCH expands to " # nop" and MOVNTQ to a plain "movq" store.
   *
   * MOVNTQ forwards to REAL_MOVNTQ so that its arguments are macro-expanded
   * before REAL_MOVNTQ stringifies them with '#'.
   */
  20. #undef REAL_MOVNTQ
  21. #undef MOVNTQ
  22. #undef PREFETCH
  23. #if COMPILE_TEMPLATE_MMX2
  24. #define PREFETCH "prefetchnta"
  25. #else
  26. #define PREFETCH " # nop"
  27. #endif
  28. #if COMPILE_TEMPLATE_MMX2
  29. #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
  30. #else
  31. #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
  32. #endif
  33. #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
  34. #if !COMPILE_TEMPLATE_MMX2
  /**
   * Widen eight 8-bit dither values to 16 bits and store them in the context.
   *
   * Reads 8 bytes from srcDither, zero-extends each byte to a 16-bit word,
   * shifts every word right by 4 and writes the resulting 16 bytes at
   * offset DITHER16 from &c->redDither.
   *
   * Only compiled when COMPILE_TEMPLATE_MMX2 is zero.
   *
   * @param c         context; written through &c->redDither + DITHER16
   * @param srcDither pointer to 8 readable dither bytes
   * @param rot       if non-zero, the 64-bit dither row is rotated right by
   *                  24 bits before being widened
   *
   * The asm modifies mm0, mm3 and mm4 and stores to memory, but declares
   * neither clobbers nor a memory output.
   */
  35. static av_always_inline void
  36. dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot)
  37. {
  38. if (rot) {
  /* (x >> 24) | (x << 40): rotate the 64-bit row right by 24 bits */
  39. __asm__ volatile("pxor %%mm0, %%mm0\n\t"
  40. "movq (%0), %%mm3\n\t"
  41. "movq %%mm3, %%mm4\n\t"
  42. "psrlq $24, %%mm3\n\t"
  43. "psllq $40, %%mm4\n\t"
  44. "por %%mm4, %%mm3\n\t"
  45. "movq %%mm3, %%mm4\n\t"
  46. "punpcklbw %%mm0, %%mm3\n\t"
  47. "punpckhbw %%mm0, %%mm4\n\t"
  48. "psraw $4, %%mm3\n\t"
  49. "psraw $4, %%mm4\n\t"
  50. "movq %%mm3, "DITHER16"+0(%1)\n\t"
  51. "movq %%mm4, "DITHER16"+8(%1)\n\t"
  52. :: "r"(srcDither), "r"(&c->redDither)
  53. );
  54. } else {
  /* same widening as above, without the rotation */
  55. __asm__ volatile("pxor %%mm0, %%mm0\n\t"
  56. "movq (%0), %%mm3\n\t"
  57. "movq %%mm3, %%mm4\n\t"
  58. "punpcklbw %%mm0, %%mm3\n\t"
  59. "punpckhbw %%mm0, %%mm4\n\t"
  60. "psraw $4, %%mm3\n\t"
  61. "psraw $4, %%mm4\n\t"
  62. "movq %%mm3, "DITHER16"+0(%1)\n\t"
  63. "movq %%mm4, "DITHER16"+8(%1)\n\t"
  64. :: "r"(srcDither), "r"(&c->redDither)
  65. );
  66. }
  67. }
  68. #endif
  /**
   * Convert up to four planes of 16-bit intermediate samples to 8-bit output
   * without vertical filtering.
   *
   * For every plane p (0 = luma, 1 = U, 2 = V, 3 = alpha) whose dst[p] is
   * non-NULL, each sample is shifted right arithmetically by 7 and packed to
   * a byte with unsigned saturation.
   *
   * Source and destination pointers are biased to the end of the row and the
   * index register runs from -counter[p] upwards in steps of 8; the loop
   * exits when "add $8" produces a carry. Whole groups of 8 samples are
   * always read and written.
   *
   * @param c        not referenced by this function
   * @param lumSrc   luma samples (dstW entries used as the row length)
   * @param chrUSrc  U samples (chrDstW entries used as the row length)
   * @param chrVSrc  V samples (chrDstW entries used as the row length)
   * @param alpSrc   alpha samples; only read when dst[3] is non-NULL
   * @param dst      output plane pointers; NULL entries are skipped
   * @param dstW     luma/alpha row width
   * @param chrDstW  chroma row width
   */
  69. static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
  70. const int16_t *chrUSrc, const int16_t *chrVSrc,
  71. const int16_t *alpSrc,
  72. uint8_t *dst[4], int dstW, int chrDstW)
  73. {
  74. int p= 4;
  75. const int16_t *src[4]= {
  76. lumSrc + dstW, chrUSrc + chrDstW,
  77. chrVSrc + chrDstW, alpSrc + dstW
  78. };
  79. x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
  /* planes are visited in the order 3, 2, 1, 0 */
  80. while (p--) {
  81. if (dst[p]) {
  82. __asm__ volatile(
  83. "mov %2, %%"REG_a" \n\t"
  84. ".p2align 4 \n\t" /* FIXME Unroll? */
  85. "1: \n\t"
  86. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
  87. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
  88. "psraw $7, %%mm0 \n\t"
  89. "psraw $7, %%mm1 \n\t"
  90. "packuswb %%mm1, %%mm0 \n\t"
  91. MOVNTQ(%%mm0, (%1, %%REGa))
  92. "add $8, %%"REG_a" \n\t"
  93. "jnc 1b \n\t"
  94. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  95. "g" (-counter[p])
  96. : "%"REG_a
  97. );
  98. }
  99. }
  100. }
  /**
   * Dithered variant of the unfiltered 16-bit to 8-bit plane output.
   *
   * For every plane p (0 = luma, 1 = U, 2 = V, 3 = alpha) whose dst[p] is
   * non-NULL, dither_8to16() fills the DITHER16 table in the context, the
   * table is loaded into mm6/mm7 and added to the samples with signed
   * saturation before the shift right by 7 and the unsigned-saturating pack
   * to bytes.
   *
   * Loop structure is the same negative-index scheme as the non-dithered
   * variant: pointers are biased to the row end, the index runs from
   * -counter[p] upwards in steps of 8 until "add $8" carries.
   *
   * @param c        context; supplies lumDither8/chrDither8 and receives the
   *                 widened dither table
   * @param lumSrc   luma samples
   * @param chrUSrc  U samples
   * @param chrVSrc  V samples
   * @param alpSrc   alpha samples; only read when dst[3] is non-NULL
   * @param dst      output plane pointers; NULL entries are skipped
   * @param dstW     luma/alpha row width
   * @param chrDstW  chroma row width
   */
  101. static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
  102. const int16_t *chrUSrc, const int16_t *chrVSrc,
  103. const int16_t *alpSrc,
  104. uint8_t *dst[4], int dstW, int chrDstW)
  105. {
  106. int p= 4;
  107. const int16_t *src[4]= {
  108. lumSrc + dstW, chrUSrc + chrDstW,
  109. chrVSrc + chrDstW, alpSrc + dstW
  110. };
  111. x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
  112. const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
  113. while (p--) {
  114. if (dst[p]) {
  /*
   * NOTE(review): planes 2 (V) and 3 (alpha) get chrDither while planes
   * 0 (luma) and 1 (U) get lumDither; given the order of src[] one would
   * expect (p == 1 || p == 2). Confirm before changing, since this alters
   * the bit-exact output. Only plane 2 uses the rotated table.
   */
  115. dither_8to16(c, (p == 2 || p == 3) ? chrDither : lumDither, p == 2);
  116. __asm__ volatile(
  117. "mov %2, %%"REG_a" \n\t"
  118. "movq "DITHER16"+0(%3), %%mm6 \n\t"
  119. "movq "DITHER16"+8(%3), %%mm7 \n\t"
  120. ".p2align 4 \n\t" /* FIXME Unroll? */
  121. "1: \n\t"
  122. "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
  123. "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
  124. "paddsw %%mm6, %%mm0 \n\t"
  125. "paddsw %%mm7, %%mm1 \n\t"
  126. "psraw $7, %%mm0 \n\t"
  127. "psraw $7, %%mm1 \n\t"
  128. "packuswb %%mm1, %%mm0 \n\t"
  129. MOVNTQ(%%mm0, (%1, %%REGa))
  130. "add $8, %%"REG_a" \n\t"
  131. "jnc 1b \n\t"
  132. :: "r" (src[p]), "r" (dst[p] + counter[p]),
  133. "g" (-counter[p]), "r"(&c->redDither)
  134. : "%"REG_a
  135. );
  136. }
  137. }
  138. }
  /*
   * Opens an "__asm__ volatile(" statement and emits the chroma part of the
   * fast (pmulhw) vertical filter.
   *
   * REG_a is zeroed and label "1:" marks the start of the per-pixel-group
   * loop that the WRITE* macros jump back to. The inner loop "2:" walks a
   * list at CHR_MMX_FILTER_OFFSET(%0) in 16-byte steps: a source pointer at
   * +0 and a coefficient at +8, terminated by a zero pointer. U samples are
   * read at (pointer + REG_a), V samples at (pointer + %6 + REG_a).
   *
   * Result: mm3 = filtered U, mm4 = filtered V, both started from the value
   * at VROUNDER_OFFSET(%0). Uses mm0, mm2, mm5, REG_d and REG_S as scratch.
   * The statement must be closed by an operand list such as
   * YSCALEYUV2PACKEDX_END.
   */
  139. #define YSCALEYUV2PACKEDX_UV \
  140. __asm__ volatile(\
  141. "xor %%"REG_a", %%"REG_a" \n\t"\
  142. ".p2align 4 \n\t"\
  143. "nop \n\t"\
  144. "1: \n\t"\
  145. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  146. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  147. "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
  148. "movq %%mm3, %%mm4 \n\t"\
  149. ".p2align 4 \n\t"\
  150. "2: \n\t"\
  151. "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
  152. "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
  153. "add %6, %%"REG_S" \n\t" \
  154. "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
  155. "add $16, %%"REG_d" \n\t"\
  156. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  157. "pmulhw %%mm0, %%mm2 \n\t"\
  158. "pmulhw %%mm0, %%mm5 \n\t"\
  159. "paddw %%mm2, %%mm3 \n\t"\
  160. "paddw %%mm5, %%mm4 \n\t"\
  161. "test %%"REG_S", %%"REG_S" \n\t"\
  162. " jnz 2b \n\t"\
  /*
   * Luma/alpha part of the fast (pmulhw) vertical filter, for use inside an
   * asm statement opened by YSCALEYUV2PACKEDX_UV.
   *
   * Walks the filter list at offset(%0) in 16-byte steps (source pointer at
   * +0, coefficient at +8, zero pointer terminates) and accumulates eight
   * samples read at (pointer + 2*REG_a) and (pointer + 2*REG_a + 8).
   *
   * offset     - string constant: byte offset of the filter list from %0
   * coeff      - MMX register that receives the current coefficient
   * src1, src2 - MMX scratch registers for the two sample quadwords
   * dst1, dst2 - MMX accumulators, initialised from VROUNDER_OFFSET(%0)
   *
   * Clobbers REG_d and REG_S.
   */
  163. #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
  164. "lea "offset"(%0), %%"REG_d" \n\t"\
  165. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  166. "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
  167. "movq "#dst1", "#dst2" \n\t"\
  168. ".p2align 4 \n\t"\
  169. "2: \n\t"\
  170. "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
  171. "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
  172. "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
  173. "add $16, %%"REG_d" \n\t"\
  174. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  175. "pmulhw "#coeff", "#src1" \n\t"\
  176. "pmulhw "#coeff", "#src2" \n\t"\
  177. "paddw "#src1", "#dst1" \n\t"\
  178. "paddw "#src2", "#dst2" \n\t"\
  179. "test %%"REG_S", %%"REG_S" \n\t"\
  180. " jnz 2b \n\t"\
  /*
   * Fast vertical filter for chroma and luma: chroma ends up in mm3 (U) and
   * mm4 (V), luma in mm1 and mm7; mm0, mm2 and mm5 are used as scratch.
   */
  181. #define YSCALEYUV2PACKEDX \
  182. YSCALEYUV2PACKEDX_UV \
  183. YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
  /*
   * Operand list, clobber list and closing ");" for an asm statement opened
   * by YSCALEYUV2PACKEDX_UV or YSCALEYUV2PACKEDX_ACCURATE_UV.
   *
   * Operands: %0 = &c->redDither, %1..%3 = dummy, %4 = dest,
   *           %5 = dstW_reg, %6 = uv_off.
   * Clobbers: REG_a, REG_d, REG_S.
   *
   * The enclosing function must define c, dummy, dest, dstW_reg and uv_off.
   */
  184. #define YSCALEYUV2PACKEDX_END \
  185. :: "r" (&c->redDither), \
  186. "m" (dummy), "m" (dummy), "m" (dummy),\
  187. "r" (dest), "m" (dstW_reg), "m"(uv_off) \
  188. : "%"REG_a, "%"REG_d, "%"REG_S \
  189. );
  /*
   * Opens an "__asm__ volatile(" statement and emits the chroma part of the
   * accurate-rounding (pmaddwd) vertical filter.
   *
   * REG_a is zeroed and label "1:" marks the start of the per-pixel-group
   * loop. The inner loop "2:" walks the list at CHR_MMX_FILTER_OFFSET(%0)
   * in APCK_SIZE-byte steps; each entry supplies two source pointers (at +0
   * and at APCK_PTR2) and a coefficient pair at APCK_COEF, and a zero
   * pointer terminates the list. V samples are read %6 bytes after the
   * corresponding U samples.
   *
   * Products are accumulated as 32-bit values in mm4/mm5 (U) and mm6/mm7
   * (V), shifted right by 16, packed to words with signed saturation and
   * offset by the value at VROUNDER_OFFSET(%0).
   *
   * Result: U is stored at U_TEMP(%0) and V at V_TEMP(%0); all of mm0..mm7,
   * REG_d and REG_S are overwritten.
   */
  190. #define YSCALEYUV2PACKEDX_ACCURATE_UV \
  191. __asm__ volatile(\
  192. "xor %%"REG_a", %%"REG_a" \n\t"\
  193. ".p2align 4 \n\t"\
  194. "nop \n\t"\
  195. "1: \n\t"\
  196. "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
  197. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  198. "pxor %%mm4, %%mm4 \n\t"\
  199. "pxor %%mm5, %%mm5 \n\t"\
  200. "pxor %%mm6, %%mm6 \n\t"\
  201. "pxor %%mm7, %%mm7 \n\t"\
  202. ".p2align 4 \n\t"\
  203. "2: \n\t"\
  204. "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
  205. "add %6, %%"REG_S" \n\t" \
  206. "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
  207. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  208. "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
  209. "movq %%mm0, %%mm3 \n\t"\
  210. "punpcklwd %%mm1, %%mm0 \n\t"\
  211. "punpckhwd %%mm1, %%mm3 \n\t"\
  212. "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
  213. "pmaddwd %%mm1, %%mm0 \n\t"\
  214. "pmaddwd %%mm1, %%mm3 \n\t"\
  215. "paddd %%mm0, %%mm4 \n\t"\
  216. "paddd %%mm3, %%mm5 \n\t"\
  217. "add %6, %%"REG_S" \n\t" \
  218. "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
  219. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  220. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  221. "test %%"REG_S", %%"REG_S" \n\t"\
  222. "movq %%mm2, %%mm0 \n\t"\
  223. "punpcklwd %%mm3, %%mm2 \n\t"\
  224. "punpckhwd %%mm3, %%mm0 \n\t"\
  225. "pmaddwd %%mm1, %%mm2 \n\t"\
  226. "pmaddwd %%mm1, %%mm0 \n\t"\
  227. "paddd %%mm2, %%mm6 \n\t"\
  228. "paddd %%mm0, %%mm7 \n\t"\
  229. " jnz 2b \n\t"\
  230. "psrad $16, %%mm4 \n\t"\
  231. "psrad $16, %%mm5 \n\t"\
  232. "psrad $16, %%mm6 \n\t"\
  233. "psrad $16, %%mm7 \n\t"\
  234. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  235. "packssdw %%mm5, %%mm4 \n\t"\
  236. "packssdw %%mm7, %%mm6 \n\t"\
  237. "paddw %%mm0, %%mm4 \n\t"\
  238. "paddw %%mm0, %%mm6 \n\t"\
  239. "movq %%mm4, "U_TEMP"(%0) \n\t"\
  240. "movq %%mm6, "V_TEMP"(%0) \n\t"\
  /*
   * Luma/alpha part of the accurate-rounding (pmaddwd) vertical filter, for
   * use inside an asm statement opened by YSCALEYUV2PACKEDX_ACCURATE_UV.
   *
   * Walks the list at offset(%0) in APCK_SIZE-byte steps (two source
   * pointers at +0 and APCK_PTR2, coefficient pair at APCK_COEF, zero
   * pointer terminates), accumulates eight samples as 32-bit sums, shifts
   * them right by 16, packs to words with signed saturation and adds the
   * value at VROUNDER_OFFSET(%0).
   *
   * Result: mm1 and mm7 hold the eight filtered samples. As its last step
   * the macro reloads mm3 from U_TEMP(%0) and mm4 from V_TEMP(%0).
   * All of mm0..mm7, REG_d and REG_S are overwritten.
   */
  241. #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
  242. "lea "offset"(%0), %%"REG_d" \n\t"\
  243. "mov (%%"REG_d"), %%"REG_S" \n\t"\
  244. "pxor %%mm1, %%mm1 \n\t"\
  245. "pxor %%mm5, %%mm5 \n\t"\
  246. "pxor %%mm7, %%mm7 \n\t"\
  247. "pxor %%mm6, %%mm6 \n\t"\
  248. ".p2align 4 \n\t"\
  249. "2: \n\t"\
  250. "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
  251. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
  252. "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
  253. "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
  254. "movq %%mm0, %%mm3 \n\t"\
  255. "punpcklwd %%mm4, %%mm0 \n\t"\
  256. "punpckhwd %%mm4, %%mm3 \n\t"\
  257. "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
  258. "pmaddwd %%mm4, %%mm0 \n\t"\
  259. "pmaddwd %%mm4, %%mm3 \n\t"\
  260. "paddd %%mm0, %%mm1 \n\t"\
  261. "paddd %%mm3, %%mm5 \n\t"\
  262. "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
  263. "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
  264. "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
  265. "test %%"REG_S", %%"REG_S" \n\t"\
  266. "movq %%mm2, %%mm0 \n\t"\
  267. "punpcklwd %%mm3, %%mm2 \n\t"\
  268. "punpckhwd %%mm3, %%mm0 \n\t"\
  269. "pmaddwd %%mm4, %%mm2 \n\t"\
  270. "pmaddwd %%mm4, %%mm0 \n\t"\
  271. "paddd %%mm2, %%mm7 \n\t"\
  272. "paddd %%mm0, %%mm6 \n\t"\
  273. " jnz 2b \n\t"\
  274. "psrad $16, %%mm1 \n\t"\
  275. "psrad $16, %%mm5 \n\t"\
  276. "psrad $16, %%mm7 \n\t"\
  277. "psrad $16, %%mm6 \n\t"\
  278. "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
  279. "packssdw %%mm5, %%mm1 \n\t"\
  280. "packssdw %%mm6, %%mm7 \n\t"\
  281. "paddw %%mm0, %%mm1 \n\t"\
  282. "paddw %%mm0, %%mm7 \n\t"\
  283. "movq "U_TEMP"(%0), %%mm3 \n\t"\
  284. "movq "V_TEMP"(%0), %%mm4 \n\t"\
  /*
   * Accurate-rounding vertical filter for chroma and luma: on completion
   * mm1/mm7 hold luma, mm3 holds U and mm4 holds V.
   */
  285. #define YSCALEYUV2PACKEDX_ACCURATE \
  286. YSCALEYUV2PACKEDX_ACCURATE_UV \
  287. YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
  /*
   * YUV to RGB conversion for the vertically filtered samples.
   *
   * Input:  mm1/mm7 = luma words, mm3 = U words, mm4 = V words.
   * Uses the offsets and coefficients stored relative to %0 (U_OFFSET,
   * V_OFFSET, Y_OFFSET, UG_COEFF, VG_COEFF, UB_COEFF, VR_COEFF, Y_COEFF).
   * Output: mm2 = B, mm4 = G, mm5 = R, each as eight unsigned-saturated
   * bytes; mm0, mm3 and mm6 are used as scratch.
   */
  288. #define YSCALEYUV2RGBX \
  289. "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
  290. "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
  291. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  292. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  293. "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
  294. "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
  295. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  296. "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
  297. "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
  298. "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
  299. "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
  300. "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
  301. "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
  302. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  303. "paddw %%mm3, %%mm4 \n\t"\
  304. "movq %%mm2, %%mm0 \n\t"\
  305. "movq %%mm5, %%mm6 \n\t"\
  306. "movq %%mm4, %%mm3 \n\t"\
  307. "punpcklwd %%mm2, %%mm2 \n\t"\
  308. "punpcklwd %%mm5, %%mm5 \n\t"\
  309. "punpcklwd %%mm4, %%mm4 \n\t"\
  310. "paddw %%mm1, %%mm2 \n\t"\
  311. "paddw %%mm1, %%mm5 \n\t"\
  312. "paddw %%mm1, %%mm4 \n\t"\
  313. "punpckhwd %%mm0, %%mm0 \n\t"\
  314. "punpckhwd %%mm6, %%mm6 \n\t"\
  315. "punpckhwd %%mm3, %%mm3 \n\t"\
  316. "paddw %%mm7, %%mm0 \n\t"\
  317. "paddw %%mm7, %%mm6 \n\t"\
  318. "paddw %%mm7, %%mm3 \n\t"\
  319. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  320. "packuswb %%mm0, %%mm2 \n\t"\
  321. "packuswb %%mm6, %%mm5 \n\t"\
  322. "packuswb %%mm3, %%mm4 \n\t"\
  /*
   * Interleave four registers of eight bytes each (b, g, r, a) into eight
   * 4-byte pixels and store them with MOVNTQ at dst + 4*index (32 bytes).
   * Then advance index by 8 and jump back to label "1:" while index is
   * below dstw (unsigned comparison).
   *
   * dst, dstw, index - asm operands / registers used for addressing
   * b, g, r, a       - MMX registers holding the packed input channels
   * q0, q2, q3, t    - MMX scratch registers
   *
   * b and r are overwritten. WRITEBGR32 forwards to REAL_WRITEBGR32 so that
   * the arguments are macro-expanded before being stringified.
   */
  323. #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
  324. "movq "#b", "#q2" \n\t" /* B */\
  325. "movq "#r", "#t" \n\t" /* R */\
  326. "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
  327. "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
  328. "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
  329. "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
  330. "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
  331. "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
  332. "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
  333. "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
  334. "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
  335. "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
  336. \
  337. MOVNTQ( q0, (dst, index, 4))\
  338. MOVNTQ( b, 8(dst, index, 4))\
  339. MOVNTQ( q2, 16(dst, index, 4))\
  340. MOVNTQ( q3, 24(dst, index, 4))\
  341. \
  342. "add $8, "#index" \n\t"\
  343. "cmp "#dstw", "#index" \n\t"\
  344. " jb 1b \n\t"
  345. #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
  /**
   * Vertically filter YUV(A) and write 32-bit packed pixels, using the
   * accurate-rounding (pmaddwd) filter.
   *
   * When CONFIG_SWSCALE_ALPHA is enabled and c->alpPixBuf is set, the alpha
   * plane is filtered as well; otherwise the alpha byte is set to 0xFF.
   *
   * The filter and source parameters are not referenced directly in this
   * body; the asm reads its tables at fixed offsets from &c->redDither.
   *
   * @param c     context supplying filter tables, coefficients and temporaries
   * @param dest  output row
   * @param dstW  output width in pixels; pixels are produced in groups of 8
   */
  346. static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
  347. const int16_t **lumSrc, int lumFilterSize,
  348. const int16_t *chrFilter, const int16_t **chrUSrc,
  349. const int16_t **chrVSrc,
  350. int chrFilterSize, const int16_t **alpSrc,
  351. uint8_t *dest, int dstW, int dstY)
  352. {
  353. x86_reg dummy=0;
  354. x86_reg dstW_reg = dstW;
  355. x86_reg uv_off = c->uv_off_byte;
  356. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  357. YSCALEYUV2PACKEDX_ACCURATE
  358. YSCALEYUV2RGBX
  /*
   * The alpha filter pass overwrites every MMX register, so B, G and R are
   * spilled to U_TEMP, V_TEMP and Y_TEMP. The alpha pass ends by reloading
   * U_TEMP into mm3 and V_TEMP into mm4, which is why B and G are then
   * found in mm3 and mm4; R is reloaded explicitly into mm5.
   */
  359. "movq %%mm2, "U_TEMP"(%0) \n\t"
  360. "movq %%mm4, "V_TEMP"(%0) \n\t"
  361. "movq %%mm5, "Y_TEMP"(%0) \n\t"
  362. YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
  363. "movq "Y_TEMP"(%0), %%mm5 \n\t"
  364. "psraw $3, %%mm1 \n\t"
  365. "psraw $3, %%mm7 \n\t"
  366. "packuswb %%mm7, %%mm1 \n\t"
  367. WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
  368. YSCALEYUV2PACKEDX_END
  369. } else {
  370. YSCALEYUV2PACKEDX_ACCURATE
  371. YSCALEYUV2RGBX
  /* no alpha plane: all-ones alpha */
  372. "pcmpeqd %%mm7, %%mm7 \n\t"
  373. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  374. YSCALEYUV2PACKEDX_END
  375. }
  376. }
  /**
   * Vertically filter YUV(A) and write 32-bit packed pixels, using the fast
   * (pmulhw) filter.
   *
   * When CONFIG_SWSCALE_ALPHA is enabled and c->alpPixBuf is set, the alpha
   * plane is filtered into mm1/mm7 with mm0, mm3 and mm6 as scratch, so the
   * B/G/R registers (mm2, mm4, mm5) survive; otherwise the alpha byte is
   * set to 0xFF.
   *
   * The filter and source parameters are not referenced directly in this
   * body; the asm reads its tables at fixed offsets from &c->redDither.
   *
   * @param c     context supplying filter tables and coefficients
   * @param dest  output row
   * @param dstW  output width in pixels; pixels are produced in groups of 8
   */
  377. static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
  378. const int16_t **lumSrc, int lumFilterSize,
  379. const int16_t *chrFilter, const int16_t **chrUSrc,
  380. const int16_t **chrVSrc,
  381. int chrFilterSize, const int16_t **alpSrc,
  382. uint8_t *dest, int dstW, int dstY)
  383. {
  384. x86_reg dummy=0;
  385. x86_reg dstW_reg = dstW;
  386. x86_reg uv_off = c->uv_off_byte;
  387. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  388. YSCALEYUV2PACKEDX
  389. YSCALEYUV2RGBX
  390. YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
  391. "psraw $3, %%mm1 \n\t"
  392. "psraw $3, %%mm7 \n\t"
  393. "packuswb %%mm7, %%mm1 \n\t"
  394. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  395. YSCALEYUV2PACKEDX_END
  396. } else {
  397. YSCALEYUV2PACKEDX
  398. YSCALEYUV2RGBX
  /* no alpha plane: all-ones alpha */
  399. "pcmpeqd %%mm7, %%mm7 \n\t"
  400. WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  401. YSCALEYUV2PACKEDX_END
  402. }
  403. }
  /*
   * Pack eight pixels held as bytes in mm2 (B), mm4 (G) and mm5 (R) into
   * eight 16-bit pixels and store them with MOVNTQ at dst + 2*index.
   * B and R are masked with bF8 and G with bFC before being merged.
   * Then advance index by 8 and jump back to label "1:" while index is
   * below dstw (unsigned comparison).
   *
   * Requires mm7 == 0 on entry; overwrites mm1..mm5.
   * WRITERGB16 forwards to REAL_WRITERGB16 so that the arguments are
   * macro-expanded before being stringified.
   */
  404. #define REAL_WRITERGB16(dst, dstw, index) \
  405. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  406. "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
  407. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  408. "psrlq $3, %%mm2 \n\t"\
  409. \
  410. "movq %%mm2, %%mm1 \n\t"\
  411. "movq %%mm4, %%mm3 \n\t"\
  412. \
  413. "punpcklbw %%mm7, %%mm3 \n\t"\
  414. "punpcklbw %%mm5, %%mm2 \n\t"\
  415. "punpckhbw %%mm7, %%mm4 \n\t"\
  416. "punpckhbw %%mm5, %%mm1 \n\t"\
  417. \
  418. "psllq $3, %%mm3 \n\t"\
  419. "psllq $3, %%mm4 \n\t"\
  420. \
  421. "por %%mm3, %%mm2 \n\t"\
  422. "por %%mm4, %%mm1 \n\t"\
  423. \
  424. MOVNTQ(%%mm2, (dst, index, 2))\
  425. MOVNTQ(%%mm1, 8(dst, index, 2))\
  426. \
  427. "add $8, "#index" \n\t"\
  428. "cmp "#dstw", "#index" \n\t"\
  429. " jb 1b \n\t"
  430. #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
  /**
   * Vertically filter YUV and write 16-bit (565) packed pixels, using the
   * accurate-rounding (pmaddwd) filter.
   *
   * If DITHER1XBPP is defined, the BLUE_DITHER, GREEN_DITHER and RED_DITHER
   * values stored relative to &c->redDither are added to the channels with
   * unsigned saturation before packing.
   *
   * The filter and source parameters are not referenced directly in this
   * body; the asm reads its tables at fixed offsets from &c->redDither.
   *
   * @param c     context supplying filter tables, coefficients and dither
   * @param dest  output row
   * @param dstW  output width in pixels; pixels are produced in groups of 8
   */
  431. static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
  432. const int16_t **lumSrc, int lumFilterSize,
  433. const int16_t *chrFilter, const int16_t **chrUSrc,
  434. const int16_t **chrVSrc,
  435. int chrFilterSize, const int16_t **alpSrc,
  436. uint8_t *dest, int dstW, int dstY)
  437. {
  438. x86_reg dummy=0;
  439. x86_reg dstW_reg = dstW;
  440. x86_reg uv_off = c->uv_off_byte;
  441. YSCALEYUV2PACKEDX_ACCURATE
  442. YSCALEYUV2RGBX
  443. "pxor %%mm7, %%mm7 \n\t"
  444. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  445. #ifdef DITHER1XBPP
  446. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  447. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  448. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  449. #endif
  450. WRITERGB16(%4, %5, %%REGa)
  451. YSCALEYUV2PACKEDX_END
  452. }
  /**
   * Vertically filter YUV and write 16-bit (565) packed pixels, using the
   * fast (pmulhw) filter.
   *
   * If DITHER1XBPP is defined, the BLUE_DITHER, GREEN_DITHER and RED_DITHER
   * values stored relative to &c->redDither are added to the channels with
   * unsigned saturation before packing.
   *
   * The filter and source parameters are not referenced directly in this
   * body; the asm reads its tables at fixed offsets from &c->redDither.
   *
   * @param c     context supplying filter tables, coefficients and dither
   * @param dest  output row
   * @param dstW  output width in pixels; pixels are produced in groups of 8
   */
  453. static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
  454. const int16_t **lumSrc, int lumFilterSize,
  455. const int16_t *chrFilter, const int16_t **chrUSrc,
  456. const int16_t **chrVSrc,
  457. int chrFilterSize, const int16_t **alpSrc,
  458. uint8_t *dest, int dstW, int dstY)
  459. {
  460. x86_reg dummy=0;
  461. x86_reg dstW_reg = dstW;
  462. x86_reg uv_off = c->uv_off_byte;
  463. YSCALEYUV2PACKEDX
  464. YSCALEYUV2RGBX
  465. "pxor %%mm7, %%mm7 \n\t"
  466. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  467. #ifdef DITHER1XBPP
  468. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  469. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  470. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  471. #endif
  472. WRITERGB16(%4, %5, %%REGa)
  473. YSCALEYUV2PACKEDX_END
  474. }
  /*
   * Pack eight pixels held as bytes in mm2 (B), mm4 (G) and mm5 (R) into
   * eight 16-bit pixels and store them with MOVNTQ at dst + 2*index.
   * All three channels are masked with bF8; compared with the RGB16
   * writer, R is additionally shifted right by 1 and G is shifted left by
   * 2 instead of 3.
   * Then advance index by 8 and jump back to label "1:" while index is
   * below dstw (unsigned comparison).
   *
   * Requires mm7 == 0 on entry; overwrites mm1..mm5.
   * WRITERGB15 forwards to REAL_WRITERGB15 so that the arguments are
   * macro-expanded before being stringified.
   */
  475. #define REAL_WRITERGB15(dst, dstw, index) \
  476. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  477. "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
  478. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  479. "psrlq $3, %%mm2 \n\t"\
  480. "psrlq $1, %%mm5 \n\t"\
  481. \
  482. "movq %%mm2, %%mm1 \n\t"\
  483. "movq %%mm4, %%mm3 \n\t"\
  484. \
  485. "punpcklbw %%mm7, %%mm3 \n\t"\
  486. "punpcklbw %%mm5, %%mm2 \n\t"\
  487. "punpckhbw %%mm7, %%mm4 \n\t"\
  488. "punpckhbw %%mm5, %%mm1 \n\t"\
  489. \
  490. "psllq $2, %%mm3 \n\t"\
  491. "psllq $2, %%mm4 \n\t"\
  492. \
  493. "por %%mm3, %%mm2 \n\t"\
  494. "por %%mm4, %%mm1 \n\t"\
  495. \
  496. MOVNTQ(%%mm2, (dst, index, 2))\
  497. MOVNTQ(%%mm1, 8(dst, index, 2))\
  498. \
  499. "add $8, "#index" \n\t"\
  500. "cmp "#dstw", "#index" \n\t"\
  501. " jb 1b \n\t"
  502. #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
  /**
   * Vertically filter YUV and write 16-bit (555) packed pixels, using the
   * accurate-rounding (pmaddwd) filter.
   *
   * If DITHER1XBPP is defined, the BLUE_DITHER, GREEN_DITHER and RED_DITHER
   * values stored relative to &c->redDither are added to the channels with
   * unsigned saturation before packing.
   *
   * The filter and source parameters are not referenced directly in this
   * body; the asm reads its tables at fixed offsets from &c->redDither.
   *
   * @param c     context supplying filter tables, coefficients and dither
   * @param dest  output row
   * @param dstW  output width in pixels; pixels are produced in groups of 8
   */
  503. static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
  504. const int16_t **lumSrc, int lumFilterSize,
  505. const int16_t *chrFilter, const int16_t **chrUSrc,
  506. const int16_t **chrVSrc,
  507. int chrFilterSize, const int16_t **alpSrc,
  508. uint8_t *dest, int dstW, int dstY)
  509. {
  510. x86_reg dummy=0;
  511. x86_reg dstW_reg = dstW;
  512. x86_reg uv_off = c->uv_off_byte;
  513. YSCALEYUV2PACKEDX_ACCURATE
  514. YSCALEYUV2RGBX
  515. "pxor %%mm7, %%mm7 \n\t"
  516. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  517. #ifdef DITHER1XBPP
  518. "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
  519. "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
  520. "paddusb "RED_DITHER"(%0), %%mm5\n\t"
  521. #endif
  522. WRITERGB15(%4, %5, %%REGa)
  523. YSCALEYUV2PACKEDX_END
  524. }
  /**
   * Vertically filter YUV and write 16-bit (555) packed pixels, using the
   * fast (pmulhw) filter.
   *
   * If DITHER1XBPP is defined, the BLUE_DITHER, GREEN_DITHER and RED_DITHER
   * values stored relative to &c->redDither are added to the channels with
   * unsigned saturation before packing.
   *
   * The filter and source parameters are not referenced directly in this
   * body; the asm reads its tables at fixed offsets from &c->redDither.
   *
   * @param c     context supplying filter tables, coefficients and dither
   * @param dest  output row
   * @param dstW  output width in pixels; pixels are produced in groups of 8
   */
  525. static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
  526. const int16_t **lumSrc, int lumFilterSize,
  527. const int16_t *chrFilter, const int16_t **chrUSrc,
  528. const int16_t **chrVSrc,
  529. int chrFilterSize, const int16_t **alpSrc,
  530. uint8_t *dest, int dstW, int dstY)
  531. {
  532. x86_reg dummy=0;
  533. x86_reg dstW_reg = dstW;
  534. x86_reg uv_off = c->uv_off_byte;
  535. YSCALEYUV2PACKEDX
  536. YSCALEYUV2RGBX
  537. "pxor %%mm7, %%mm7 \n\t"
  538. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  539. #ifdef DITHER1XBPP
  540. "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
  541. "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
  542. "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
  543. #endif
  544. WRITERGB15(%4, %5, %%REGa)
  545. YSCALEYUV2PACKEDX_END
  546. }
  /*
   * Plain-MMX 24-bit writer: converts eight pixels held as bytes in mm2 (B),
   * mm4 (G) and mm5 (R) into 24 contiguous output bytes and stores them
   * with three MOVNTQ writes at dst, dst+8 and dst+16.
   * Then advances dst by 24 and index by 8 and jumps back to label "1:"
   * while index is below dstw (unsigned comparison).
   *
   * Requires mm7 == 0 on entry; overwrites mm0..mm7. dst is modified, so it
   * must name a writable register rather than an input operand.
   */
  547. #define WRITEBGR24MMX(dst, dstw, index) \
  548. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  549. "movq %%mm2, %%mm1 \n\t" /* B */\
  550. "movq %%mm5, %%mm6 \n\t" /* R */\
  551. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  552. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  553. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  554. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  555. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  556. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  557. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  558. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  559. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  560. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  561. \
  562. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  563. "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
  564. "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
  565. "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
  566. \
  567. "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
  568. "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
  569. "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
  570. "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
  571. \
  572. "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
  573. "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
  574. "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
  575. "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
  576. \
  577. "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
  578. "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
  579. "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
  580. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  581. MOVNTQ(%%mm0, (dst))\
  582. \
  583. "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
  584. "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
  585. "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
  586. "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
  587. MOVNTQ(%%mm6, 8(dst))\
  588. \
  589. "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
  590. "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
  591. "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
  592. MOVNTQ(%%mm5, 16(dst))\
  593. \
  594. "add $24, "#dst" \n\t"\
  595. \
  596. "add $8, "#index" \n\t"\
  597. "cmp "#dstw", "#index" \n\t"\
  598. " jb 1b \n\t"
  /*
   * MMX2 24-bit writer (uses pshufw): converts eight pixels held as bytes in
   * mm2 (B), mm4 (G) and mm5 (R) into 24 contiguous output bytes using the
   * ff_M24A, ff_M24B and ff_M24C masks, and stores them with three MOVNTQ
   * writes at dst, dst+8 and dst+16.
   * Then advances dst by 24 and index by 8 and jumps back to label "1:"
   * while index is below dstw (unsigned comparison).
   *
   * Overwrites mm0, mm1, mm3, mm4, mm6 and mm7; mm7 is loaded here, so its
   * value on entry is not used. dst is modified, so it must name a writable
   * register rather than an input operand.
   */
  599. #define WRITEBGR24MMX2(dst, dstw, index) \
  600. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  601. "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
  602. "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
  603. "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
  604. "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
  605. "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
  606. \
  607. "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
  608. "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
  609. "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
  610. \
  611. "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
  612. "por %%mm1, %%mm6 \n\t"\
  613. "por %%mm3, %%mm6 \n\t"\
  614. MOVNTQ(%%mm6, (dst))\
  615. \
  616. "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
  617. "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
  618. "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
  619. "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
  620. \
  621. "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
  622. "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
  623. "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
  624. \
  625. "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
  626. "por %%mm3, %%mm6 \n\t"\
  627. MOVNTQ(%%mm6, 8(dst))\
  628. \
  629. "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
  630. "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
  631. "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
  632. \
  633. "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
  634. "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
  635. "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
  636. \
  637. "por %%mm1, %%mm3 \n\t"\
  638. "por %%mm3, %%mm6 \n\t"\
  639. MOVNTQ(%%mm6, 16(dst))\
  640. \
  641. "add $24, "#dst" \n\t"\
  642. \
  643. "add $8, "#index" \n\t"\
  644. "cmp "#dstw", "#index" \n\t"\
  645. " jb 1b \n\t"
  /*
   * WRITEBGR24 resolves to the pshufw-based writer when
   * COMPILE_TEMPLATE_MMX2 is non-zero and to the plain-MMX writer otherwise.
   */
  646. #if COMPILE_TEMPLATE_MMX2
  647. #undef WRITEBGR24
  648. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
  649. #else
  650. #undef WRITEBGR24
  651. #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
  652. #endif
  /**
   * Vertically filter YUV and write 24-bit packed pixels, using the
   * accurate-rounding (pmaddwd) filter.
   *
   * The output pointer is kept in REG_c and recomputed as dest + 3*REG_a on
   * every loop iteration. Because REG_c must be listed as clobbered, the
   * operand list is written out here instead of using
   * YSCALEYUV2PACKEDX_END.
   *
   * The filter and source parameters are not referenced directly in this
   * body; the asm reads its tables at fixed offsets from &c->redDither.
   *
   * @param c     context supplying filter tables and coefficients
   * @param dest  output row
   * @param dstW  output width in pixels; pixels are produced in groups of 8
   */
  653. static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
  654. const int16_t **lumSrc, int lumFilterSize,
  655. const int16_t *chrFilter, const int16_t **chrUSrc,
  656. const int16_t **chrVSrc,
  657. int chrFilterSize, const int16_t **alpSrc,
  658. uint8_t *dest, int dstW, int dstY)
  659. {
  660. x86_reg dummy=0;
  661. x86_reg dstW_reg = dstW;
  662. x86_reg uv_off = c->uv_off_byte;
  663. YSCALEYUV2PACKEDX_ACCURATE
  664. YSCALEYUV2RGBX
  665. "pxor %%mm7, %%mm7 \n\t"
  666. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
  667. "add %4, %%"REG_c" \n\t"
  668. WRITEBGR24(%%REGc, %5, %%REGa)
  669. :: "r" (&c->redDither),
  670. "m" (dummy), "m" (dummy), "m" (dummy),
  671. "r" (dest), "m" (dstW_reg), "m"(uv_off)
  672. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  673. );
  674. }
  /**
   * Vertically filter YUV and write 24-bit packed pixels, using the fast
   * (pmulhw) filter.
   *
   * The output pointer is kept in REG_c and recomputed as dest + 3*REG_a on
   * every loop iteration. Because REG_c must be listed as clobbered, the
   * operand list is written out here instead of using
   * YSCALEYUV2PACKEDX_END.
   *
   * The filter and source parameters are not referenced directly in this
   * body; the asm reads its tables at fixed offsets from &c->redDither.
   *
   * @param c     context supplying filter tables and coefficients
   * @param dest  output row
   * @param dstW  output width in pixels; pixels are produced in groups of 8
   */
  675. static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
  676. const int16_t **lumSrc, int lumFilterSize,
  677. const int16_t *chrFilter, const int16_t **chrUSrc,
  678. const int16_t **chrVSrc,
  679. int chrFilterSize, const int16_t **alpSrc,
  680. uint8_t *dest, int dstW, int dstY)
  681. {
  682. x86_reg dummy=0;
  683. x86_reg dstW_reg = dstW;
  684. x86_reg uv_off = c->uv_off_byte;
  685. YSCALEYUV2PACKEDX
  686. YSCALEYUV2RGBX
  687. "pxor %%mm7, %%mm7 \n\t"
  688. "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
  689. "add %4, %%"REG_c" \n\t"
  690. WRITEBGR24(%%REGc, %5, %%REGa)
  691. :: "r" (&c->redDither),
  692. "m" (dummy), "m" (dummy), "m" (dummy),
  693. "r" (dest), "m" (dstW_reg), "m"(uv_off)
  694. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
  695. );
  696. }
  /*
   * Pack luma words in mm1/mm7 and chroma words in mm3 (U) / mm4 (V) to
   * bytes with unsigned saturation, interleave each luma byte with an
   * alternating U/V byte and store the 16 resulting bytes with MOVNTQ at
   * dst + 2*index.
   * Then advance index by 8 and jump back to label "1:" while index is
   * below dstw (unsigned comparison).
   *
   * Overwrites mm1, mm3, mm4 and mm7. WRITEYUY2 forwards to REAL_WRITEYUY2
   * so that the arguments are macro-expanded before being stringified.
   */
  697. #define REAL_WRITEYUY2(dst, dstw, index) \
  698. "packuswb %%mm3, %%mm3 \n\t"\
  699. "packuswb %%mm4, %%mm4 \n\t"\
  700. "packuswb %%mm7, %%mm1 \n\t"\
  701. "punpcklbw %%mm4, %%mm3 \n\t"\
  702. "movq %%mm1, %%mm7 \n\t"\
  703. "punpcklbw %%mm3, %%mm1 \n\t"\
  704. "punpckhbw %%mm3, %%mm7 \n\t"\
  705. \
  706. MOVNTQ(%%mm1, (dst, index, 2))\
  707. MOVNTQ(%%mm7, 8(dst, index, 2))\
  708. \
  709. "add $8, "#index" \n\t"\
  710. "cmp "#dstw", "#index" \n\t"\
  711. " jb 1b \n\t"
  712. #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
  /**
   * Vertically filter YUV and write packed YUYV 4:2:2, using the
   * accurate-rounding (pmaddwd) filter.
   *
   * The filtered 16-bit samples are shifted right by 3 and then packed and
   * interleaved by WRITEYUY2; no colour-space conversion takes place.
   *
   * The filter and source parameters are not referenced directly in this
   * body; the asm reads its tables at fixed offsets from &c->redDither.
   *
   * @param c     context supplying filter tables
   * @param dest  output row
   * @param dstW  output width in pixels; pixels are produced in groups of 8
   */
  713. static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
  714. const int16_t **lumSrc, int lumFilterSize,
  715. const int16_t *chrFilter, const int16_t **chrUSrc,
  716. const int16_t **chrVSrc,
  717. int chrFilterSize, const int16_t **alpSrc,
  718. uint8_t *dest, int dstW, int dstY)
  719. {
  720. x86_reg dummy=0;
  721. x86_reg dstW_reg = dstW;
  722. x86_reg uv_off = c->uv_off_byte;
  723. YSCALEYUV2PACKEDX_ACCURATE
  724. /* mm1/mm7 = luma, mm3 = U, mm4 = V (16-bit words) */
  725. "psraw $3, %%mm3 \n\t"
  726. "psraw $3, %%mm4 \n\t"
  727. "psraw $3, %%mm1 \n\t"
  728. "psraw $3, %%mm7 \n\t"
  729. WRITEYUY2(%4, %5, %%REGa)
  730. YSCALEYUV2PACKEDX_END
  731. }
  /**
   * Vertically filter YUV and write packed YUYV 4:2:2, using the fast
   * (pmulhw) filter.
   *
   * The filtered 16-bit samples are shifted right by 3 and then packed and
   * interleaved by WRITEYUY2; no colour-space conversion takes place.
   *
   * The filter and source parameters are not referenced directly in this
   * body; the asm reads its tables at fixed offsets from &c->redDither.
   *
   * @param c     context supplying filter tables
   * @param dest  output row
   * @param dstW  output width in pixels; pixels are produced in groups of 8
   */
  732. static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
  733. const int16_t **lumSrc, int lumFilterSize,
  734. const int16_t *chrFilter, const int16_t **chrUSrc,
  735. const int16_t **chrVSrc,
  736. int chrFilterSize, const int16_t **alpSrc,
  737. uint8_t *dest, int dstW, int dstY)
  738. {
  739. x86_reg dummy=0;
  740. x86_reg dstW_reg = dstW;
  741. x86_reg uv_off = c->uv_off_byte;
  742. YSCALEYUV2PACKEDX
  743. /* mm1/mm7 = luma, mm3 = U, mm4 = V (16-bit words) */
  744. "psraw $3, %%mm3 \n\t"
  745. "psraw $3, %%mm4 \n\t"
  746. "psraw $3, %%mm1 \n\t"
  747. "psraw $3, %%mm7 \n\t"
  748. WRITEYUY2(%4, %5, %%REGa)
  749. YSCALEYUV2PACKEDX_END
  750. }
  /*
   * Chroma part of the two-line (bilinear) vertical blend followed by the
   * first step of the YUV to RGB conversion.
   *
   * Zeroes index and defines label "1:" as the loop start. Operands %2 and
   * %3 are hard-coded as the two chroma line buffers; V samples are read
   * from the same buffers after temporarily adding UV_OFF_PX(c) to index.
   * Each channel is computed as
   *     ((line0 - line1) * coeff >> 16) + (line1 >> 4)
   * with the coefficient taken from CHR_MMX_FILTER_OFFSET+8(c).
   *
   * index - register used as the sample index
   * c     - register holding the base address for the context offsets
   *
   * On exit: mm2 = U - U_OFFSET, mm5 = V - V_OFFSET, mm3 = mm2 scaled by
   * UG_COEFF, mm4 = mm5 scaled by VG_COEFF; mm0 is overwritten.
   */
  751. #define REAL_YSCALEYUV2RGB_UV(index, c) \
  752. "xor "#index", "#index" \n\t"\
  753. ".p2align 4 \n\t"\
  754. "1: \n\t"\
  755. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  756. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  757. "add "UV_OFF_PX"("#c"), "#index" \n\t" \
  758. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  759. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  760. "sub "UV_OFF_PX"("#c"), "#index" \n\t" \
  761. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  762. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  763. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  764. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  765. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  766. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  767. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  768. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  769. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  770. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  771. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  772. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  773. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  774. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  775. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  776. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  /*
   * Luma/alpha part of the two-line (bilinear) vertical blend.
   *
   * Reads eight 16-bit samples from each of the two line buffers b1 and b2
   * at byte offset 2*index and computes, per sample,
   *     ((b1 - b2) * coeff >> 16) + (b2 >> 4)
   * with the coefficient taken from LUM_MMX_FILTER_OFFSET+8(c).
   *
   * index  - register used as the sample index
   * c      - register holding the base address for the context offsets
   * b1, b2 - registers holding the two line buffer pointers
   *
   * Result in mm1 (first four samples) and mm7 (last four); mm0 and mm6 are
   * overwritten.
   */
  777. #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
  778. "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  779. "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  780. "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  781. "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  782. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  783. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  784. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  785. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  786. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  787. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  788. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  789. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  /*
   * Final step of the YUV to RGB conversion for the bilinear path.
   *
   * Input:  mm1/mm7 = luma words, mm2 = U - U_OFFSET, mm5 = V - V_OFFSET,
   *         mm3/mm4 = the U and V contributions to green.
   * Uses UB_COEFF, VR_COEFF, Y_OFFSET and Y_COEFF at offsets from c.
   * Output: mm2 = B, mm4 = G, mm5 = R, each as eight unsigned-saturated
   * bytes; mm0, mm3 and mm6 are used as scratch.
   */
  790. #define REAL_YSCALEYUV2RGB_COEFF(c) \
  791. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  792. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  793. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  794. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  795. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  796. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  797. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  798. "paddw %%mm3, %%mm4 \n\t"\
  799. "movq %%mm2, %%mm0 \n\t"\
  800. "movq %%mm5, %%mm6 \n\t"\
  801. "movq %%mm4, %%mm3 \n\t"\
  802. "punpcklwd %%mm2, %%mm2 \n\t"\
  803. "punpcklwd %%mm5, %%mm5 \n\t"\
  804. "punpcklwd %%mm4, %%mm4 \n\t"\
  805. "paddw %%mm1, %%mm2 \n\t"\
  806. "paddw %%mm1, %%mm5 \n\t"\
  807. "paddw %%mm1, %%mm4 \n\t"\
  808. "punpckhwd %%mm0, %%mm0 \n\t"\
  809. "punpckhwd %%mm6, %%mm6 \n\t"\
  810. "punpckhwd %%mm3, %%mm3 \n\t"\
  811. "paddw %%mm7, %%mm0 \n\t"\
  812. "paddw %%mm7, %%mm6 \n\t"\
  813. "paddw %%mm7, %%mm3 \n\t"\
  814. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  815. "packuswb %%mm0, %%mm2 \n\t"\
  816. "packuswb %%mm6, %%mm5 \n\t"\
  817. "packuswb %%mm3, %%mm4 \n\t"\
  /* Indirection so that macro arguments are expanded before
   * REAL_YSCALEYUV2RGB_YA stringifies them with '#'. */
  818. #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
  /* Complete interpolating YUV->RGB kernel body: the chroma stage
   * (REAL_YSCALEYUV2RGB_UV, defined outside this section), luma interpolation
   * between asm operands %0 and %1, then the coefficient stage.
   * Leaves mm2 = B, mm4 = G, mm5 = R. */
  819. #define YSCALEYUV2RGB(index, c) \
  820. REAL_YSCALEYUV2RGB_UV(index, c) \
  821. REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
  822. REAL_YSCALEYUV2RGB_COEFF(c)
  823. /**
  824. * vertical bilinear scale YV12 to RGB
  *
  * Interpolates between buf[0]/buf[1] (luma) and ubuf[0]/ubuf[1] (chroma)
  * with YSCALEYUV2RGB and writes 32-bit pixels to dest via WRITEBGR32
  * (defined outside this section).
  * When CONFIG_SWSCALE_ALPHA is set and c->alpPixBuf is non-NULL, alpha is
  * interpolated between abuf[0]/abuf[1] with the same macro used for luma;
  * otherwise the alpha register is filled with all-ones bits.
  * vbuf, dstW, yalpha, uvalpha and y are not referenced by the C code; the
  * asm reaches its parameters through offsets from &c->redDither (operand %5).
  825. */
  826. static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
  827. const int16_t *ubuf[2], const int16_t *vbuf[2],
  828. const int16_t *abuf[2], uint8_t *dest,
  829. int dstW, int yalpha, int uvalpha, int y)
  830. {
  831. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  832. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  833. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  834. const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
  835. #if ARCH_X86_64
  /* x86-64: r8 is the loop index and the alpha lines arrive directly as
   * operands %6/%7. */
  836. __asm__ volatile(
  837. YSCALEYUV2RGB(%%r8, %5)
  838. YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
  839. "psraw $3, %%mm1 \n\t" /* interpolated alpha (samples 0..3) >>3 */
  840. "psraw $3, %%mm7 \n\t" /* interpolated alpha (samples 4..7) >>3 */
  841. "packuswb %%mm7, %%mm1 \n\t"
  842. WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  843. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
  844. "a" (&c->redDither),
  845. "r" (abuf0), "r" (abuf1)
  846. : "%r8"
  847. );
  848. #else
  /* Stash the alpha line pointers in the context; the asm below reloads
   * them from U_TEMP/V_TEMP into %0/%1 after saving the luma pointers on
   * the stack. */
  849. *(const uint16_t **)(&c->u_temp)=abuf0;
  850. *(const uint16_t **)(&c->v_temp)=abuf1;
  851. __asm__ volatile(
  /* REG_b is saved in the context and restored by hand, then used to hold
   * dest; REG_BP is pushed and used as the loop index. */
  852. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  853. "mov %4, %%"REG_b" \n\t"
  854. "push %%"REG_BP" \n\t"
  855. YSCALEYUV2RGB(%%REGBP, %5)
  856. "push %0 \n\t"
  857. "push %1 \n\t"
  858. "mov "U_TEMP"(%5), %0 \n\t"
  859. "mov "V_TEMP"(%5), %1 \n\t"
  860. YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
  861. "psraw $3, %%mm1 \n\t" /* interpolated alpha (samples 0..3) >>3 */
  862. "psraw $3, %%mm7 \n\t" /* interpolated alpha (samples 4..7) >>3 */
  863. "packuswb %%mm7, %%mm1 \n\t"
  864. "pop %1 \n\t"
  865. "pop %0 \n\t"
  866. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
  867. "pop %%"REG_BP" \n\t"
  868. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  869. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  870. "a" (&c->redDither)
  871. );
  872. #endif
  873. } else {
  874. __asm__ volatile(
  875. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  876. "mov %4, %%"REG_b" \n\t"
  877. "push %%"REG_BP" \n\t"
  878. YSCALEYUV2RGB(%%REGBP, %5)
  /* no alpha plane: set every bit of mm7, which is passed as the alpha
   * argument of WRITEBGR32 */
  879. "pcmpeqd %%mm7, %%mm7 \n\t"
  880. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  881. "pop %%"REG_BP" \n\t"
  882. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  883. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  884. "a" (&c->redDither)
  885. );
  886. }
  887. }
  /*
   * Vertical bilinear scale to packed 24-bit output: runs YSCALEYUV2RGB on
   * buf[0]/buf[1] and ubuf[0]/ubuf[1], zeroes mm7, then stores through
   * WRITEBGR24 (defined outside this section) to dest.
   * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced by the C code;
   * the asm reads what it needs through offsets from &c->redDither
   * (operand %5). REG_b is saved to the context at ESP_OFFSET and restored by
   * hand; REG_BP is pushed and used as the loop index.
   */
  888. static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
  889. const int16_t *ubuf[2], const int16_t *vbuf[2],
  890. const int16_t *abuf[2], uint8_t *dest,
  891. int dstW, int yalpha, int uvalpha, int y)
  892. {
  893. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  894. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  895. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  896. __asm__ volatile(
  897. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  898. "mov %4, %%"REG_b" \n\t"
  899. "push %%"REG_BP" \n\t"
  900. YSCALEYUV2RGB(%%REGBP, %5)
  901. "pxor %%mm7, %%mm7 \n\t"
  902. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  903. "pop %%"REG_BP" \n\t"
  904. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  905. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  906. "a" (&c->redDither)
  907. );
  908. }
  /*
   * Vertical bilinear scale to 15-bit RGB: runs YSCALEYUV2RGB on
   * buf[0]/buf[1] and ubuf[0]/ubuf[1], optionally adds the per-channel dither
   * values (only when DITHER1XBPP is defined), then stores through
   * WRITERGB15 (defined outside this section) to dest.
   * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced by the C code;
   * the asm reads what it needs through offsets from &c->redDither
   * (operand %5).
   */
  909. static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
  910. const int16_t *ubuf[2], const int16_t *vbuf[2],
  911. const int16_t *abuf[2], uint8_t *dest,
  912. int dstW, int yalpha, int uvalpha, int y)
  913. {
  914. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  915. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  916. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  917. __asm__ volatile(
  /* save REG_b in the context, reuse it for dest; REG_BP is the loop index */
  918. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  919. "mov %4, %%"REG_b" \n\t"
  920. "push %%"REG_BP" \n\t"
  921. YSCALEYUV2RGB(%%REGBP, %5)
  922. "pxor %%mm7, %%mm7 \n\t"
  923. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  924. #ifdef DITHER1XBPP
  925. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  926. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  927. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  928. #endif
  929. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  930. "pop %%"REG_BP" \n\t"
  931. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  932. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  933. "a" (&c->redDither)
  934. );
  935. }
  /*
   * Vertical bilinear scale to 16-bit RGB: runs YSCALEYUV2RGB on
   * buf[0]/buf[1] and ubuf[0]/ubuf[1], optionally adds the per-channel dither
   * values (only when DITHER1XBPP is defined), then stores through
   * WRITERGB16 (defined outside this section) to dest.
   * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced by the C code;
   * the asm reads what it needs through offsets from &c->redDither
   * (operand %5).
   */
  936. static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
  937. const int16_t *ubuf[2], const int16_t *vbuf[2],
  938. const int16_t *abuf[2], uint8_t *dest,
  939. int dstW, int yalpha, int uvalpha, int y)
  940. {
  941. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  942. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  943. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  944. __asm__ volatile(
  /* save REG_b in the context, reuse it for dest; REG_BP is the loop index */
  945. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  946. "mov %4, %%"REG_b" \n\t"
  947. "push %%"REG_BP" \n\t"
  948. YSCALEYUV2RGB(%%REGBP, %5)
  949. "pxor %%mm7, %%mm7 \n\t"
  950. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  951. #ifdef DITHER1XBPP
  952. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  953. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  954. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  955. #endif
  956. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  957. "pop %%"REG_BP" \n\t"
  958. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  959. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  960. "a" (&c->redDither)
  961. );
  962. }
  /*
   * Loop head for vertically interpolated packed-YUV output.
   *   index - register used as the loop index (zeroed here)
   *   c     - base register for the coefficient and UV_OFF_PX lookups
   * Uses asm operands %0/%1 as the two luma lines and %2/%3 as the two chroma
   * lines; the second chroma plane is read at index + UV_OFF_PX(c).
   * NOTE: before the loop the coefficients at CHR_MMX_FILTER_OFFSET+8 and
   * LUM_MMX_FILTER_OFFSET+8 are shifted right by 3 and written back in
   * place, so every expansion of this macro modifies those stored values.
   * On exit from the body: mm3 and mm4 hold the two interpolated chroma
   * planes, mm1/mm7 the interpolated luma for samples 0..3 / 4..7.
   * Defines local label "1:"; the matching backward jump is not part of
   * this macro.
   */
  963. #define REAL_YSCALEYUV2PACKED(index, c) \
  964. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  965. "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
  966. "psraw $3, %%mm0 \n\t"\
  967. "psraw $3, %%mm1 \n\t"\
  968. "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  969. "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
  970. "xor "#index", "#index" \n\t"\
  971. ".p2align 4 \n\t"\
  972. "1: \n\t"\
  973. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  974. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  975. "add "UV_OFF_PX"("#c"), "#index" \n\t" \
  976. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  977. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  978. "sub "UV_OFF_PX"("#c"), "#index" \n\t" \
  979. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  980. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  981. "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
  982. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  983. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  984. "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
  985. "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
  986. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  987. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  988. "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  989. "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  990. "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax+4]*/\
  991. "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax+4]*/\
  992. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  993. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
  994. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  995. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
  996. "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
  997. "psraw $7, %%mm7 \n\t" /* buf1[eax+4] >>7*/\
  998. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  999. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  /* Indirection so that arguments are macro-expanded before stringification. */
  1000. #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
  /*
   * Vertical bilinear scale to packed YUYV: runs YSCALEYUV2PACKED on
   * buf[0]/buf[1] and ubuf[0]/ubuf[1], then stores through WRITEYUY2
   * (defined outside this section) to dest.
   * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced by the C code;
   * the asm reads what it needs through offsets from &c->redDither
   * (operand %5). Note that YSCALEYUV2PACKED rewrites the filter
   * coefficients stored in the context.
   */
  1001. static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
  1002. const int16_t *ubuf[2], const int16_t *vbuf[2],
  1003. const int16_t *abuf[2], uint8_t *dest,
  1004. int dstW, int yalpha, int uvalpha, int y)
  1005. {
  1006. const int16_t *buf0 = buf[0], *buf1 = buf[1],
  1007. *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1008. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  1009. __asm__ volatile(
  /* save REG_b in the context, reuse it for dest; REG_BP is the loop index */
  1010. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1011. "mov %4, %%"REG_b" \n\t"
  1012. "push %%"REG_BP" \n\t"
  1013. YSCALEYUV2PACKED(%%REGBP, %5)
  1014. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1015. "pop %%"REG_BP" \n\t"
  1016. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1017. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1018. "a" (&c->redDither)
  1019. );
  1020. }
  /*
   * Loop head for YUV->RGB from a single input line (no vertical
   * interpolation): only asm operand %0 (luma) and %2 (chroma) are read.
   *   index - register used as the loop index (zeroed here)
   *   c     - base register for the offset/coefficient lookups
   * Samples are shifted right by 4, offset-corrected with U_OFFSET, V_OFFSET
   * and Y_OFFSET, scaled by the UG/VG/UB/VR/Y coefficients, and packed with
   * unsigned saturation.
   * On exit from the body: mm2 = 8 blue bytes, mm4 = 8 green bytes,
   * mm5 = 8 red bytes. Defines local label "1:"; the matching backward jump
   * is not part of this macro.
   */
  1021. #define REAL_YSCALEYUV2RGB1(index, c) \
  1022. "xor "#index", "#index" \n\t"\
  1023. ".p2align 4 \n\t"\
  1024. "1: \n\t"\
  1025. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  1026. "add "UV_OFF_PX"("#c"), "#index" \n\t" \
  1027. "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  1028. "sub "UV_OFF_PX"("#c"), "#index" \n\t" \
  1029. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
  1030. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
  1031. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  1032. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  1033. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  1034. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  1035. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  1036. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  1037. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  1038. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1039. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
  1040. "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
  1041. "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
  1042. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  1043. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  1044. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  1045. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  1046. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  1047. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  1048. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  1049. "paddw %%mm3, %%mm4 \n\t"\
  1050. "movq %%mm2, %%mm0 \n\t"\
  1051. "movq %%mm5, %%mm6 \n\t"\
  1052. "movq %%mm4, %%mm3 \n\t"\
  1053. "punpcklwd %%mm2, %%mm2 \n\t"\
  1054. "punpcklwd %%mm5, %%mm5 \n\t"\
  1055. "punpcklwd %%mm4, %%mm4 \n\t"\
  1056. "paddw %%mm1, %%mm2 \n\t"\
  1057. "paddw %%mm1, %%mm5 \n\t"\
  1058. "paddw %%mm1, %%mm4 \n\t"\
  1059. "punpckhwd %%mm0, %%mm0 \n\t"\
  1060. "punpckhwd %%mm6, %%mm6 \n\t"\
  1061. "punpckhwd %%mm3, %%mm3 \n\t"\
  1062. "paddw %%mm7, %%mm0 \n\t"\
  1063. "paddw %%mm7, %%mm6 \n\t"\
  1064. "paddw %%mm7, %%mm3 \n\t"\
  1065. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  1066. "packuswb %%mm0, %%mm2 \n\t"\
  1067. "packuswb %%mm6, %%mm5 \n\t"\
  1068. "packuswb %%mm3, %%mm4 \n\t"\
  /* Indirection so that arguments are macro-expanded before stringification. */
  1069. #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
  1070. // do vertical chrominance interpolation
  /*
   * Same as REAL_YSCALEYUV2RGB1 except for chroma: the two chroma lines
   * (asm operands %2 and %3) are added and the sum is shifted right by 5
   * with a logical shift (psrlw), which replaces the single-line ">>4".
   * Luma still comes from operand %0 only.
   * On exit from the body: mm2 = 8 blue bytes, mm4 = 8 green bytes,
   * mm5 = 8 red bytes. Defines local label "1:"; the matching backward jump
   * is not part of this macro.
   */
  1071. #define REAL_YSCALEYUV2RGB1b(index, c) \
  1072. "xor "#index", "#index" \n\t"\
  1073. ".p2align 4 \n\t"\
  1074. "1: \n\t"\
  1075. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  1076. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  1077. "add "UV_OFF_PX"("#c"), "#index" \n\t" \
  1078. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  1079. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  1080. "sub "UV_OFF_PX"("#c"), "#index" \n\t" \
  1081. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  1082. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  1083. "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
  1084. "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
  1085. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  1086. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  1087. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  1088. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  1089. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  1090. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  1091. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  1092. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1093. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
  1094. "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
  1095. "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
  1096. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  1097. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  1098. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  1099. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  1100. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  1101. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  1102. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  1103. "paddw %%mm3, %%mm4 \n\t"\
  1104. "movq %%mm2, %%mm0 \n\t"\
  1105. "movq %%mm5, %%mm6 \n\t"\
  1106. "movq %%mm4, %%mm3 \n\t"\
  1107. "punpcklwd %%mm2, %%mm2 \n\t"\
  1108. "punpcklwd %%mm5, %%mm5 \n\t"\
  1109. "punpcklwd %%mm4, %%mm4 \n\t"\
  1110. "paddw %%mm1, %%mm2 \n\t"\
  1111. "paddw %%mm1, %%mm5 \n\t"\
  1112. "paddw %%mm1, %%mm4 \n\t"\
  1113. "punpckhwd %%mm0, %%mm0 \n\t"\
  1114. "punpckhwd %%mm6, %%mm6 \n\t"\
  1115. "punpckhwd %%mm3, %%mm3 \n\t"\
  1116. "paddw %%mm7, %%mm0 \n\t"\
  1117. "paddw %%mm7, %%mm6 \n\t"\
  1118. "paddw %%mm7, %%mm3 \n\t"\
  1119. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  1120. "packuswb %%mm0, %%mm2 \n\t"\
  1121. "packuswb %%mm6, %%mm5 \n\t"\
  1122. "packuswb %%mm3, %%mm4 \n\t"\
  /* Indirection so that arguments are macro-expanded before stringification. */
  1123. #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
  /*
   * Load eight 16-bit alpha samples from the line addressed by asm operand
   * %1 (index scaled by 2), shift each right by 7 and pack them with
   * unsigned saturation into the eight bytes of mm7. Overwrites mm1.
   * Callers must therefore bind the alpha line to operand %1.
   */
  1124. #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
  1125. "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
  1126. "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
  1127. "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
  1128. "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
  1129. "packuswb %%mm1, %%mm7 \n\t"
  /* Indirection so that the argument is macro-expanded before stringification. */
  1130. #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
  1131. /**
  1132. * YV12 to RGB without scaling or interpolating
  *
  * Converts one luma line (buf0) to 32-bit pixels in dest via WRITEBGR32
  * (defined outside this section).
  * Chroma: when uvalpha < 2048 only ubuf[0] is used (YSCALEYUV2RGB1);
  * otherwise ubuf[0] and ubuf[1] are summed (YSCALEYUV2RGB1b).
  * Alpha: when CONFIG_SWSCALE_ALPHA is set and c->alpPixBuf is non-NULL,
  * abuf0 is bound to asm operand %1 and read by YSCALEYUV2RGB1_ALPHA;
  * otherwise mm7 is filled with all-ones bits and buf1 (an alias of buf0)
  * occupies operand %1.
  * The third pointer parameter (spelled "bguf" here), dstW and y are not
  * referenced by the C code.
  1133. */
  1134. static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
  1135. const int16_t *ubuf[2], const int16_t *bguf[2],
  1136. const int16_t *abuf0, uint8_t *dest,
  1137. int dstW, int uvalpha, int y)
  1138. {
  1139. const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1140. const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1141. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1142. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1143. __asm__ volatile(
  /* save REG_b in the context, reuse it for dest; REG_BP is the loop index */
  1144. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1145. "mov %4, %%"REG_b" \n\t"
  1146. "push %%"REG_BP" \n\t"
  1147. YSCALEYUV2RGB1(%%REGBP, %5)
  1148. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1149. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1150. "pop %%"REG_BP" \n\t"
  1151. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1152. :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1153. "a" (&c->redDither)
  1154. );
  1155. } else {
  1156. __asm__ volatile(
  1157. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1158. "mov %4, %%"REG_b" \n\t"
  1159. "push %%"REG_BP" \n\t"
  1160. YSCALEYUV2RGB1(%%REGBP, %5)
  /* no alpha plane: set every bit of mm7 (the alpha argument below) */
  1161. "pcmpeqd %%mm7, %%mm7 \n\t"
  1162. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1163. "pop %%"REG_BP" \n\t"
  1164. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1165. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1166. "a" (&c->redDither)
  1167. );
  1168. }
  1169. } else {
  1170. if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
  1171. __asm__ volatile(
  1172. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1173. "mov %4, %%"REG_b" \n\t"
  1174. "push %%"REG_BP" \n\t"
  1175. YSCALEYUV2RGB1b(%%REGBP, %5)
  1176. YSCALEYUV2RGB1_ALPHA(%%REGBP)
  1177. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1178. "pop %%"REG_BP" \n\t"
  1179. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1180. :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1181. "a" (&c->redDither)
  1182. );
  1183. } else {
  1184. __asm__ volatile(
  1185. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1186. "mov %4, %%"REG_b" \n\t"
  1187. "push %%"REG_BP" \n\t"
  1188. YSCALEYUV2RGB1b(%%REGBP, %5)
  /* no alpha plane: set every bit of mm7 (the alpha argument below) */
  1189. "pcmpeqd %%mm7, %%mm7 \n\t"
  1190. WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
  1191. "pop %%"REG_BP" \n\t"
  1192. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1193. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1194. "a" (&c->redDither)
  1195. );
  1196. }
  1197. }
  1198. }
  /*
   * Convert one luma line (buf0) to packed 24-bit output in dest, without
   * vertical luma interpolation, storing through WRITEBGR24 (defined outside
   * this section).
   * Chroma: when uvalpha < 2048 only ubuf[0] is used (YSCALEYUV2RGB1);
   * otherwise ubuf[0] and ubuf[1] are summed (YSCALEYUV2RGB1b).
   * The third pointer parameter (spelled "bguf" here), abuf0, dstW and y are
   * not referenced by the C code; buf1 is only an alias of buf0 that fills
   * asm operand %1.
   */
  1199. static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
  1200. const int16_t *ubuf[2], const int16_t *bguf[2],
  1201. const int16_t *abuf0, uint8_t *dest,
  1202. int dstW, int uvalpha, int y)
  1203. {
  1204. const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1205. const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1206. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1207. __asm__ volatile(
  /* save REG_b in the context, reuse it for dest; REG_BP is the loop index */
  1208. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1209. "mov %4, %%"REG_b" \n\t"
  1210. "push %%"REG_BP" \n\t"
  1211. YSCALEYUV2RGB1(%%REGBP, %5)
  1212. "pxor %%mm7, %%mm7 \n\t"
  1213. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1214. "pop %%"REG_BP" \n\t"
  1215. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1216. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1217. "a" (&c->redDither)
  1218. );
  1219. } else {
  1220. __asm__ volatile(
  1221. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1222. "mov %4, %%"REG_b" \n\t"
  1223. "push %%"REG_BP" \n\t"
  1224. YSCALEYUV2RGB1b(%%REGBP, %5)
  1225. "pxor %%mm7, %%mm7 \n\t"
  1226. WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
  1227. "pop %%"REG_BP" \n\t"
  1228. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1229. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1230. "a" (&c->redDither)
  1231. );
  1232. }
  1233. }
  /*
   * Convert one luma line (buf0) to 15-bit RGB in dest, without vertical
   * luma interpolation, storing through WRITERGB15 (defined outside this
   * section). Per-channel dither values are added only when DITHER1XBPP is
   * defined.
   * Chroma: when uvalpha < 2048 only ubuf[0] is used (YSCALEYUV2RGB1);
   * otherwise ubuf[0] and ubuf[1] are summed (YSCALEYUV2RGB1b).
   * The third pointer parameter (spelled "bguf" here), abuf0, dstW and y are
   * not referenced by the C code; buf1 is only an alias of buf0 that fills
   * asm operand %1.
   */
  1234. static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
  1235. const int16_t *ubuf[2], const int16_t *bguf[2],
  1236. const int16_t *abuf0, uint8_t *dest,
  1237. int dstW, int uvalpha, int y)
  1238. {
  1239. const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1240. const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1241. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1242. __asm__ volatile(
  /* save REG_b in the context, reuse it for dest; REG_BP is the loop index */
  1243. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1244. "mov %4, %%"REG_b" \n\t"
  1245. "push %%"REG_BP" \n\t"
  1246. YSCALEYUV2RGB1(%%REGBP, %5)
  1247. "pxor %%mm7, %%mm7 \n\t"
  1248. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1249. #ifdef DITHER1XBPP
  1250. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1251. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1252. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1253. #endif
  1254. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1255. "pop %%"REG_BP" \n\t"
  1256. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1257. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1258. "a" (&c->redDither)
  1259. );
  1260. } else {
  1261. __asm__ volatile(
  1262. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1263. "mov %4, %%"REG_b" \n\t"
  1264. "push %%"REG_BP" \n\t"
  1265. YSCALEYUV2RGB1b(%%REGBP, %5)
  1266. "pxor %%mm7, %%mm7 \n\t"
  1267. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1268. #ifdef DITHER1XBPP
  1269. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1270. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1271. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1272. #endif
  1273. WRITERGB15(%%REGb, 8280(%5), %%REGBP)
  1274. "pop %%"REG_BP" \n\t"
  1275. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1276. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1277. "a" (&c->redDither)
  1278. );
  1279. }
  1280. }
  /*
   * Convert one luma line (buf0) to 16-bit RGB in dest, without vertical
   * luma interpolation, storing through WRITERGB16 (defined outside this
   * section). Per-channel dither values are added only when DITHER1XBPP is
   * defined.
   * Chroma: when uvalpha < 2048 only ubuf[0] is used (YSCALEYUV2RGB1);
   * otherwise ubuf[0] and ubuf[1] are summed (YSCALEYUV2RGB1b).
   * The third pointer parameter (spelled "bguf" here), abuf0, dstW and y are
   * not referenced by the C code; buf1 is only an alias of buf0 that fills
   * asm operand %1.
   */
  1281. static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
  1282. const int16_t *ubuf[2], const int16_t *bguf[2],
  1283. const int16_t *abuf0, uint8_t *dest,
  1284. int dstW, int uvalpha, int y)
  1285. {
  1286. const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1287. const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1288. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1289. __asm__ volatile(
  /* save REG_b in the context, reuse it for dest; REG_BP is the loop index */
  1290. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1291. "mov %4, %%"REG_b" \n\t"
  1292. "push %%"REG_BP" \n\t"
  1293. YSCALEYUV2RGB1(%%REGBP, %5)
  1294. "pxor %%mm7, %%mm7 \n\t"
  1295. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1296. #ifdef DITHER1XBPP
  1297. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1298. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1299. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1300. #endif
  1301. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1302. "pop %%"REG_BP" \n\t"
  1303. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1304. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1305. "a" (&c->redDither)
  1306. );
  1307. } else {
  1308. __asm__ volatile(
  1309. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1310. "mov %4, %%"REG_b" \n\t"
  1311. "push %%"REG_BP" \n\t"
  1312. YSCALEYUV2RGB1b(%%REGBP, %5)
  1313. "pxor %%mm7, %%mm7 \n\t"
  1314. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1315. #ifdef DITHER1XBPP
  1316. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1317. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1318. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1319. #endif
  1320. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1321. "pop %%"REG_BP" \n\t"
  1322. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1323. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1324. "a" (&c->redDither)
  1325. );
  1326. }
  1327. }
  /*
   * Loop head for packed-YUV output from a single input line: reads chroma
   * from asm operand %2 (second plane at index + UV_OFF_PX(c)) and luma from
   * operand %0, and shifts every sample right by 7 (arithmetic).
   * On exit from the body: mm3 and mm4 hold the two chroma planes,
   * mm1/mm7 the luma for samples 0..3 / 4..7.
   * Defines local label "1:"; the matching backward jump is not part of
   * this macro.
   */
  1328. #define REAL_YSCALEYUV2PACKED1(index, c) \
  1329. "xor "#index", "#index" \n\t"\
  1330. ".p2align 4 \n\t"\
  1331. "1: \n\t"\
  1332. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  1333. "add "UV_OFF_PX"("#c"), "#index" \n\t" \
  1334. "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  1335. "sub "UV_OFF_PX"("#c"), "#index" \n\t" \
  1336. "psraw $7, %%mm3 \n\t" \
  1337. "psraw $7, %%mm4 \n\t" \
  1338. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1339. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
  1340. "psraw $7, %%mm1 \n\t" \
  1341. "psraw $7, %%mm7 \n\t" \
  /* Indirection so that arguments are macro-expanded before stringification. */
  1342. #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
  /*
   * Variant of REAL_YSCALEYUV2PACKED1 that combines two chroma lines: the
   * samples from asm operands %2 and %3 are added and the sum is shifted
   * right by 8 with a logical shift (psrlw). Luma is still taken from
   * operand %0 only and shifted right by 7 (arithmetic).
   * On exit from the body: mm3 and mm4 hold the two chroma planes,
   * mm1/mm7 the luma for samples 0..3 / 4..7.
   * Defines local label "1:"; the matching backward jump is not part of
   * this macro.
   */
  1343. #define REAL_YSCALEYUV2PACKED1b(index, c) \
  1344. "xor "#index", "#index" \n\t"\
  1345. ".p2align 4 \n\t"\
  1346. "1: \n\t"\
  1347. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  1348. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  1349. "add "UV_OFF_PX"("#c"), "#index" \n\t" \
  1350. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  1351. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  1352. "sub "UV_OFF_PX"("#c"), "#index" \n\t" \
  1353. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  1354. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  1355. "psrlw $8, %%mm3 \n\t" \
  1356. "psrlw $8, %%mm4 \n\t" \
  1357. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1358. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
  1359. "psraw $7, %%mm1 \n\t" \
  1360. "psraw $7, %%mm7 \n\t"
  /* Indirection so that arguments are macro-expanded before stringification. */
  1361. #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
  /*
   * Convert one luma line (buf0) to packed YUYV in dest, without vertical
   * luma interpolation, storing through WRITEYUY2 (defined outside this
   * section).
   * Chroma: when uvalpha < 2048 only ubuf[0] is used (YSCALEYUV2PACKED1);
   * otherwise ubuf[0] and ubuf[1] are summed (YSCALEYUV2PACKED1b).
   * The third pointer parameter (spelled "bguf" here), abuf0, dstW and y are
   * not referenced by the C code; buf1 is only an alias of buf0 that fills
   * asm operand %1.
   */
  1362. static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
  1363. const int16_t *ubuf[2], const int16_t *bguf[2],
  1364. const int16_t *abuf0, uint8_t *dest,
  1365. int dstW, int uvalpha, int y)
  1366. {
  1367. const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
  1368. const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1369. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1370. __asm__ volatile(
  /* save REG_b in the context, reuse it for dest; REG_BP is the loop index */
  1371. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1372. "mov %4, %%"REG_b" \n\t"
  1373. "push %%"REG_BP" \n\t"
  1374. YSCALEYUV2PACKED1(%%REGBP, %5)
  1375. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1376. "pop %%"REG_BP" \n\t"
  1377. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1378. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1379. "a" (&c->redDither)
  1380. );
  1381. } else {
  1382. __asm__ volatile(
  1383. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1384. "mov %4, %%"REG_b" \n\t"
  1385. "push %%"REG_BP" \n\t"
  1386. YSCALEYUV2PACKED1b(%%REGBP, %5)
  1387. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1388. "pop %%"REG_BP" \n\t"
  1389. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1390. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1391. "a" (&c->redDither)
  1392. );
  1393. }
  1394. }
  1395. #if !COMPILE_TEMPLATE_MMX2
  1396. //FIXME yuy2* can read up to 7 samples too many
  /*
   * Extract one byte out of every two from src into dst.
   * Each iteration loads 16 source bytes, masks every 16-bit word with
   * bm01010101 (defined elsewhere; presumably 0x00FF per word, i.e. keeps
   * the even-numbered bytes -- confirm), packs to 8 bytes and stores them.
   * The index register starts at -width and counts up by 8 until it is no
   * longer negative; both pointers are pre-advanced to their ends so the
   * negative index addresses from the start. Output is written in whole
   * groups of 8 bytes. The last parameter is unused.
   */
  1397. static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src,
  1398. int width, uint32_t *unused)
  1399. {
  1400. __asm__ volatile(
  1401. "movq "MANGLE(bm01010101)", %%mm2 \n\t"
  1402. "mov %0, %%"REG_a" \n\t"
  1403. "1: \n\t"
  1404. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1405. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1406. "pand %%mm2, %%mm0 \n\t"
  1407. "pand %%mm2, %%mm1 \n\t"
  1408. "packuswb %%mm1, %%mm0 \n\t"
  1409. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1410. "add $8, %%"REG_a" \n\t"
  1411. " js 1b \n\t"
  1412. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1413. : "%"REG_a
  1414. );
  1415. }
  /*
   * De-interleave the chroma bytes of src1 into dstU and dstV.
   * Each iteration loads 16 source bytes, keeps the high byte of every
   * 16-bit word (psrlw $8) and packs them to 8 bytes; of those, the low
   * byte of each word (selected with bm01010101, defined elsewhere --
   * presumably 0x00FF per word, confirm) goes to dstU and the high byte to
   * dstV, 4 bytes each.
   * The index register starts at -width and counts up by 4 until it is no
   * longer negative; the pointers are pre-advanced to their ends.
   * src2 is only checked by the trailing assert to equal src1; the last
   * parameter is unused.
   */
  1416. static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
  1417. const uint8_t *src1, const uint8_t *src2,
  1418. int width, uint32_t *unused)
  1419. {
  1420. __asm__ volatile(
  1421. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1422. "mov %0, %%"REG_a" \n\t"
  1423. "1: \n\t"
  1424. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1425. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1426. "psrlw $8, %%mm0 \n\t"
  1427. "psrlw $8, %%mm1 \n\t"
  1428. "packuswb %%mm1, %%mm0 \n\t"
  1429. "movq %%mm0, %%mm1 \n\t"
  1430. "psrlw $8, %%mm0 \n\t"
  1431. "pand %%mm4, %%mm1 \n\t"
  1432. "packuswb %%mm0, %%mm0 \n\t"
  1433. "packuswb %%mm1, %%mm1 \n\t"
  1434. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1435. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1436. "add $4, %%"REG_a" \n\t"
  1437. " js 1b \n\t"
  1438. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1439. : "%"REG_a
  1440. );
  1441. assert(src1 == src2);
  1442. }
  1443. /* This is almost identical to the previous, and exists only because
  1444. * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses.
  *
  * Extracts the high byte of every 16-bit word of src into dst: each
  * iteration loads 16 source bytes, shifts every word right by 8, packs to
  * 8 bytes and stores them. The index register starts at -width and counts
  * up by 8 until it is no longer negative; both pointers are pre-advanced
  * to their ends. The last parameter is unused. */
  1445. static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src,
  1446. int width, uint32_t *unused)
  1447. {
  1448. __asm__ volatile(
  1449. "mov %0, %%"REG_a" \n\t"
  1450. "1: \n\t"
  1451. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1452. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1453. "psrlw $8, %%mm0 \n\t"
  1454. "psrlw $8, %%mm1 \n\t"
  1455. "packuswb %%mm1, %%mm0 \n\t"
  1456. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1457. "add $8, %%"REG_a" \n\t"
  1458. " js 1b \n\t"
  1459. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1460. : "%"REG_a
  1461. );
  1462. }
  /*
   * De-interleave the chroma bytes of src1 into dstU and dstV.
   * Same structure as yuy2ToUV, except that the first step masks each
   * 16-bit word with bm01010101 (defined elsewhere; presumably 0x00FF per
   * word, i.e. keeps the low byte -- confirm) instead of shifting it right
   * by 8. Of the 8 packed bytes, the low byte of each word goes to dstU and
   * the high byte to dstV, 4 bytes each per iteration.
   * The index register starts at -width and counts up by 4 until it is no
   * longer negative; the pointers are pre-advanced to their ends.
   * src2 is only checked by the trailing assert to equal src1; the last
   * parameter is unused.
   */
  1463. static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
  1464. const uint8_t *src1, const uint8_t *src2,
  1465. int width, uint32_t *unused)
  1466. {
  1467. __asm__ volatile(
  1468. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1469. "mov %0, %%"REG_a" \n\t"
  1470. "1: \n\t"
  1471. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1472. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1473. "pand %%mm4, %%mm0 \n\t"
  1474. "pand %%mm4, %%mm1 \n\t"
  1475. "packuswb %%mm1, %%mm0 \n\t"
  1476. "movq %%mm0, %%mm1 \n\t"
  1477. "psrlw $8, %%mm0 \n\t"
  1478. "pand %%mm4, %%mm1 \n\t"
  1479. "packuswb %%mm0, %%mm0 \n\t"
  1480. "packuswb %%mm1, %%mm1 \n\t"
  1481. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1482. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1483. "add $4, %%"REG_a" \n\t"
  1484. " js 1b \n\t"
  1485. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1486. : "%"REG_a
  1487. );
  1488. assert(src1 == src2);
  1489. }
  /*
   * Split a byte-interleaved plane into two planes.
   * Each iteration loads 16 bytes of src; the bytes selected by masking each
   * 16-bit word with bm01010101 (defined elsewhere; presumably 0x00FF per
   * word, i.e. the low byte -- confirm) are packed and stored to dst1, and
   * the high byte of each word (psrlw $8) is packed and stored to dst2,
   * 8 bytes each.
   * The index register starts at -width and counts up by 8 until it is no
   * longer negative; all pointers are pre-advanced to their ends.
   */
  1490. static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
  1491. const uint8_t *src, int width)
  1492. {
  1493. __asm__ volatile(
  1494. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1495. "mov %0, %%"REG_a" \n\t"
  1496. "1: \n\t"
  1497. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1498. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1499. "movq %%mm0, %%mm2 \n\t"
  1500. "movq %%mm1, %%mm3 \n\t"
  1501. "pand %%mm4, %%mm0 \n\t"
  1502. "pand %%mm4, %%mm1 \n\t"
  1503. "psrlw $8, %%mm2 \n\t"
  1504. "psrlw $8, %%mm3 \n\t"
  1505. "packuswb %%mm1, %%mm0 \n\t"
  1506. "packuswb %%mm3, %%mm2 \n\t"
  1507. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1508. "movq %%mm2, (%3, %%"REG_a") \n\t"
  1509. "add $8, %%"REG_a" \n\t"
  1510. " js 1b \n\t"
  1511. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
  1512. : "%"REG_a
  1513. );
  1514. }
  /* Split interleaved chroma from src1 with nvXXtoUV, sending the first
   * output plane to dstU and the second to dstV. src2 and the last
   * parameter are unused. */
  1515. static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
  1516. const uint8_t *src1, const uint8_t *src2,
  1517. int width, uint32_t *unused)
  1518. {
  1519. RENAME(nvXXtoUV)(dstU, dstV, src1, width);
  1520. }
  /* Same as nv12ToUV but with the destinations swapped: nvXXtoUV's first
   * output plane goes to dstV and the second to dstU. src2 and the last
   * parameter are unused. */
  1521. static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
  1522. const uint8_t *src1, const uint8_t *src2,
  1523. int width, uint32_t *unused)
  1524. {
  1525. RENAME(nvXXtoUV)(dstV, dstU, src1, width);
  1526. }
  1527. #endif /* !COMPILE_TEMPLATE_MMX2 */
  /*
   * Convert packed 24-bit pixels in src to one output byte per pixel in dst.
   * srcFormat selects the coefficient pair: the ff_bgr24toY* constants for
   * PIX_FMT_BGR24, the ff_rgb24toY* constants for anything else.
   * Each iteration consumes 12 source bytes (4 pixels): the bytes are widened
   * to words, multiplied and pair-summed with pmaddwd, offset by
   * ff_bgr24toYOffset, shifted right by 15 and packed to 4 output bytes.
   * The index register starts at -width and counts up by 4 until it is no
   * longer negative; dst is pre-advanced to its end, src is advanced by the
   * asm itself.
   */
  1528. static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src,
  1529. int width, enum PixelFormat srcFormat)
  1530. {
  /* The coefficients are loaded by a separate asm statement; the main loop
   * below relies on mm5/mm6 still holding them. */
  1531. if(srcFormat == PIX_FMT_BGR24) {
  1532. __asm__ volatile(
  1533. "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
  1534. "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
  1535. :
  1536. );
  1537. } else {
  1538. __asm__ volatile(
  1539. "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
  1540. "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
  1541. :
  1542. );
  1543. }
  1544. __asm__ volatile(
  1545. "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
  1546. "mov %2, %%"REG_a" \n\t"
  1547. "pxor %%mm7, %%mm7 \n\t"
  1548. "1: \n\t"
  1549. PREFETCH" 64(%0) \n\t"
  /* overlapping 4-byte loads at offsets 0, 2, 6 and 8 */
  1550. "movd (%0), %%mm0 \n\t"
  1551. "movd 2(%0), %%mm1 \n\t"
  1552. "movd 6(%0), %%mm2 \n\t"
  1553. "movd 8(%0), %%mm3 \n\t"
  1554. "add $12, %0 \n\t"
  1555. "punpcklbw %%mm7, %%mm0 \n\t"
  1556. "punpcklbw %%mm7, %%mm1 \n\t"
  1557. "punpcklbw %%mm7, %%mm2 \n\t"
  1558. "punpcklbw %%mm7, %%mm3 \n\t"
  1559. "pmaddwd %%mm5, %%mm0 \n\t"
  1560. "pmaddwd %%mm6, %%mm1 \n\t"
  1561. "pmaddwd %%mm5, %%mm2 \n\t"
  1562. "pmaddwd %%mm6, %%mm3 \n\t"
  1563. "paddd %%mm1, %%mm0 \n\t"
  1564. "paddd %%mm3, %%mm2 \n\t"
  1565. "paddd %%mm4, %%mm0 \n\t"
  1566. "paddd %%mm4, %%mm2 \n\t"
  1567. "psrad $15, %%mm0 \n\t"
  1568. "psrad $15, %%mm2 \n\t"
  1569. "packssdw %%mm2, %%mm0 \n\t"
  1570. "packuswb %%mm0, %%mm0 \n\t"
  1571. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1572. "add $4, %%"REG_a" \n\t"
  1573. " js 1b \n\t"
  1574. : "+r" (src)
  1575. : "r" (dst+width), "g" ((x86_reg)-width)
  1576. : "%"REG_a
  1577. );
  1578. }
  /* Thin wrapper: runs bgr24ToY_mmx with srcFormat fixed to PIX_FMT_BGR24.
   * The last parameter is unused. */
  1579. static void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src,
  1580. int width, uint32_t *unused)
  1581. {
  1582. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
  1583. }
  /* Thin wrapper: runs bgr24ToY_mmx with srcFormat fixed to PIX_FMT_RGB24,
   * which selects the ff_rgb24toY* coefficients. The last parameter is
   * unused. */
  1584. static void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src,
  1585. int width, uint32_t *unused)
  1586. {
  1587. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
  1588. }
  /*
   * Convert packed 24-bit pixels in src to one byte per pixel in each of
   * dstU and dstV.
   * The coefficient table is ff_bgr24toUV[srcFormat == PIX_FMT_RGB24]; its
   * four 8-byte rows are read at offsets 0, 8, 16 and 24 (the last is cached
   * in mm6). Rows 0/8 produce the value stored to dstU, rows 16/24 the value
   * stored to dstV.
   * Each iteration consumes 12 source bytes (4 pixels) in two halves, adds
   * ff_bgr24toUVOffset, shifts right by 15 and packs to 4 bytes per plane.
   * The index register starts at -width and counts up by 4 until it is no
   * longer negative; dstU/dstV are pre-advanced to their ends, src is
   * advanced by the asm itself.
   */
  1589. static av_always_inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV,
  1590. const uint8_t *src, int width,
  1591. enum PixelFormat srcFormat)
  1592. {
  1593. __asm__ volatile(
  1594. "movq 24(%4), %%mm6 \n\t"
  1595. "mov %3, %%"REG_a" \n\t"
  1596. "pxor %%mm7, %%mm7 \n\t"
  1597. "1: \n\t"
  1598. PREFETCH" 64(%0) \n\t"
  /* first half: loads at offsets 0 and 2 */
  1599. "movd (%0), %%mm0 \n\t"
  1600. "movd 2(%0), %%mm1 \n\t"
  1601. "punpcklbw %%mm7, %%mm0 \n\t"
  1602. "punpcklbw %%mm7, %%mm1 \n\t"
  1603. "movq %%mm0, %%mm2 \n\t"
  1604. "movq %%mm1, %%mm3 \n\t"
  1605. "pmaddwd (%4), %%mm0 \n\t"
  1606. "pmaddwd 8(%4), %%mm1 \n\t"
  1607. "pmaddwd 16(%4), %%mm2 \n\t"
  1608. "pmaddwd %%mm6, %%mm3 \n\t"
  1609. "paddd %%mm1, %%mm0 \n\t"
  1610. "paddd %%mm3, %%mm2 \n\t"
  /* second half: loads at offsets 6 and 8 */
  1611. "movd 6(%0), %%mm1 \n\t"
  1612. "movd 8(%0), %%mm3 \n\t"
  1613. "add $12, %0 \n\t"
  1614. "punpcklbw %%mm7, %%mm1 \n\t"
  1615. "punpcklbw %%mm7, %%mm3 \n\t"
  1616. "movq %%mm1, %%mm4 \n\t"
  1617. "movq %%mm3, %%mm5 \n\t"
  1618. "pmaddwd (%4), %%mm1 \n\t"
  1619. "pmaddwd 8(%4), %%mm3 \n\t"
  1620. "pmaddwd 16(%4), %%mm4 \n\t"
  1621. "pmaddwd %%mm6, %%mm5 \n\t"
  1622. "paddd %%mm3, %%mm1 \n\t"
  1623. "paddd %%mm5, %%mm4 \n\t"
  1624. "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
  1625. "paddd %%mm3, %%mm0 \n\t"
  1626. "paddd %%mm3, %%mm2 \n\t"
  1627. "paddd %%mm3, %%mm1 \n\t"
  1628. "paddd %%mm3, %%mm4 \n\t"
  1629. "psrad $15, %%mm0 \n\t"
  1630. "psrad $15, %%mm2 \n\t"
  1631. "psrad $15, %%mm1 \n\t"
  1632. "psrad $15, %%mm4 \n\t"
  1633. "packssdw %%mm1, %%mm0 \n\t"
  1634. "packssdw %%mm4, %%mm2 \n\t"
  1635. "packuswb %%mm0, %%mm0 \n\t"
  1636. "packuswb %%mm2, %%mm2 \n\t"
  1637. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1638. "movd %%mm2, (%2, %%"REG_a") \n\t"
  1639. "add $4, %%"REG_a" \n\t"
  1640. " js 1b \n\t"
  1641. : "+r" (src)
  1642. : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
  1643. : "%"REG_a
  1644. );
  1645. }
  1646. static void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV,
  1647. const uint8_t *src1, const uint8_t *src2,
  1648. int width, uint32_t *unused)
  1649. {
  1650. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
  1651. assert(src1 == src2);
  1652. }
  1653. static void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV,
  1654. const uint8_t *src1, const uint8_t *src2,
  1655. int width, uint32_t *unused)
  1656. {
  1657. assert(src1==src2);
  1658. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
  1659. }
  1660. #if COMPILE_TEMPLATE_MMX2
  1661. static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
  1662. int dstWidth, const uint8_t *src,
  1663. int srcW, int xInc)
  1664. {
  1665. int16_t *filterPos = c->hLumFilterPos;
  1666. int16_t *filter = c->hLumFilter;
  1667. void *mmx2FilterCode= c->lumMmx2FilterCode;
  1668. int i;
  1669. #if defined(PIC)
  1670. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  1671. #endif
  1672. __asm__ volatile(
  1673. #if defined(PIC)
  1674. "mov %%"REG_b", %5 \n\t"
  1675. #endif
  1676. "pxor %%mm7, %%mm7 \n\t"
  1677. "mov %0, %%"REG_c" \n\t"
  1678. "mov %1, %%"REG_D" \n\t"
  1679. "mov %2, %%"REG_d" \n\t"
  1680. "mov %3, %%"REG_b" \n\t"
  1681. "xor %%"REG_a", %%"REG_a" \n\t" // i
  1682. PREFETCH" (%%"REG_c") \n\t"
  1683. PREFETCH" 32(%%"REG_c") \n\t"
  1684. PREFETCH" 64(%%"REG_c") \n\t"
  1685. #if ARCH_X86_64
  1686. #define CALL_MMX2_FILTER_CODE \
  1687. "movl (%%"REG_b"), %%esi \n\t"\
  1688. "call *%4 \n\t"\
  1689. "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
  1690. "add %%"REG_S", %%"REG_c" \n\t"\
  1691. "add %%"REG_a", %%"REG_D" \n\t"\
  1692. "xor %%"REG_a", %%"REG_a" \n\t"\
  1693. #else
  1694. #define CALL_MMX2_FILTER_CODE \
  1695. "movl (%%"REG_b"), %%esi \n\t"\
  1696. "call *%4 \n\t"\
  1697. "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
  1698. "add %%"REG_a", %%"REG_D" \n\t"\
  1699. "xor %%"REG_a", %%"REG_a" \n\t"\
  1700. #endif /* ARCH_X86_64 */
  1701. CALL_MMX2_FILTER_CODE
  1702. CALL_MMX2_FILTER_CODE
  1703. CALL_MMX2_FILTER_CODE
  1704. CALL_MMX2_FILTER_CODE
  1705. CALL_MMX2_FILTER_CODE
  1706. CALL_MMX2_FILTER_CODE
  1707. CALL_MMX2_FILTER_CODE
  1708. CALL_MMX2_FILTER_CODE
  1709. #if defined(PIC)
  1710. "mov %5, %%"REG_b" \n\t"
  1711. #endif
  1712. :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
  1713. "m" (mmx2FilterCode)
  1714. #if defined(PIC)
  1715. ,"m" (ebxsave)
  1716. #endif
  1717. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  1718. #if !defined(PIC)
  1719. ,"%"REG_b
  1720. #endif
  1721. );
  1722. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
  1723. dst[i] = src[srcW-1]*128;
  1724. }
  1725. static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
  1726. int dstWidth, const uint8_t *src1,
  1727. const uint8_t *src2, int srcW, int xInc)
  1728. {
  1729. int16_t *filterPos = c->hChrFilterPos;
  1730. int16_t *filter = c->hChrFilter;
  1731. void *mmx2FilterCode= c->chrMmx2FilterCode;
  1732. int i;
  1733. #if defined(PIC)
  1734. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  1735. #endif
  1736. __asm__ volatile(
  1737. #if defined(PIC)
  1738. "mov %%"REG_b", %7 \n\t"
  1739. #endif
  1740. "pxor %%mm7, %%mm7 \n\t"
  1741. "mov %0, %%"REG_c" \n\t"
  1742. "mov %1, %%"REG_D" \n\t"
  1743. "mov %2, %%"REG_d" \n\t"
  1744. "mov %3, %%"REG_b" \n\t"
  1745. "xor %%"REG_a", %%"REG_a" \n\t" // i
  1746. PREFETCH" (%%"REG_c") \n\t"
  1747. PREFETCH" 32(%%"REG_c") \n\t"
  1748. PREFETCH" 64(%%"REG_c") \n\t"
  1749. CALL_MMX2_FILTER_CODE
  1750. CALL_MMX2_FILTER_CODE
  1751. CALL_MMX2_FILTER_CODE
  1752. CALL_MMX2_FILTER_CODE
  1753. "xor %%"REG_a", %%"REG_a" \n\t" // i
  1754. "mov %5, %%"REG_c" \n\t" // src
  1755. "mov %6, %%"REG_D" \n\t" // buf2
  1756. PREFETCH" (%%"REG_c") \n\t"
  1757. PREFETCH" 32(%%"REG_c") \n\t"
  1758. PREFETCH" 64(%%"REG_c") \n\t"
  1759. CALL_MMX2_FILTER_CODE
  1760. CALL_MMX2_FILTER_CODE
  1761. CALL_MMX2_FILTER_CODE
  1762. CALL_MMX2_FILTER_CODE
  1763. #if defined(PIC)
  1764. "mov %7, %%"REG_b" \n\t"
  1765. #endif
  1766. :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
  1767. "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
  1768. #if defined(PIC)
  1769. ,"m" (ebxsave)
  1770. #endif
  1771. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  1772. #if !defined(PIC)
  1773. ,"%"REG_b
  1774. #endif
  1775. );
  1776. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
  1777. dst1[i] = src1[srcW-1]*128;
  1778. dst2[i] = src2[srcW-1]*128;
  1779. }
  1780. }
  1781. #endif /* COMPILE_TEMPLATE_MMX2 */
  1782. static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
  1783. {
  1784. enum PixelFormat srcFormat = c->srcFormat,
  1785. dstFormat = c->dstFormat;
  1786. if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
  1787. dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) {
  1788. if (!(c->flags & SWS_BITEXACT)) {
  1789. if (c->flags & SWS_ACCURATE_RND) {
  1790. //c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
  1791. if (!(c->flags & SWS_FULL_CHR_H_INT)) {
  1792. switch (c->dstFormat) {
  1793. case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
  1794. case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
  1795. case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
  1796. case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
  1797. case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
  1798. default: break;
  1799. }
  1800. }
  1801. } else {
  1802. //c->yuv2yuv1 = RENAME(yuv2yuv1 );
  1803. if (!(c->flags & SWS_FULL_CHR_H_INT)) {
  1804. switch (c->dstFormat) {
  1805. case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
  1806. case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
  1807. case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
  1808. case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
  1809. case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
  1810. default: break;
  1811. }
  1812. }
  1813. }
  1814. }
  1815. if (!(c->flags & SWS_FULL_CHR_H_INT)) {
  1816. switch (c->dstFormat) {
  1817. case PIX_FMT_RGB32:
  1818. c->yuv2packed1 = RENAME(yuv2rgb32_1);
  1819. c->yuv2packed2 = RENAME(yuv2rgb32_2);
  1820. break;
  1821. case PIX_FMT_BGR24:
  1822. c->yuv2packed1 = RENAME(yuv2bgr24_1);
  1823. c->yuv2packed2 = RENAME(yuv2bgr24_2);
  1824. break;
  1825. case PIX_FMT_RGB555:
  1826. c->yuv2packed1 = RENAME(yuv2rgb555_1);
  1827. c->yuv2packed2 = RENAME(yuv2rgb555_2);
  1828. break;
  1829. case PIX_FMT_RGB565:
  1830. c->yuv2packed1 = RENAME(yuv2rgb565_1);
  1831. c->yuv2packed2 = RENAME(yuv2rgb565_2);
  1832. break;
  1833. case PIX_FMT_YUYV422:
  1834. c->yuv2packed1 = RENAME(yuv2yuyv422_1);
  1835. c->yuv2packed2 = RENAME(yuv2yuyv422_2);
  1836. break;
  1837. default:
  1838. break;
  1839. }
  1840. }
  1841. }
  1842. if (c->srcBpc == 8 && c->dstBpc <= 10) {
  1843. // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
  1844. #if COMPILE_TEMPLATE_MMX2
  1845. if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
  1846. {
  1847. c->hyscale_fast = RENAME(hyscale_fast);
  1848. c->hcscale_fast = RENAME(hcscale_fast);
  1849. } else {
  1850. #endif /* COMPILE_TEMPLATE_MMX2 */
  1851. c->hyscale_fast = NULL;
  1852. c->hcscale_fast = NULL;
  1853. #if COMPILE_TEMPLATE_MMX2
  1854. }
  1855. #endif /* COMPILE_TEMPLATE_MMX2 */
  1856. }
  1857. #if !COMPILE_TEMPLATE_MMX2
  1858. switch(srcFormat) {
  1859. case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
  1860. case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
  1861. case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
  1862. case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
  1863. default: break;
  1864. }
  1865. #endif /* !COMPILE_TEMPLATE_MMX2 */
  1866. if (!c->chrSrcHSubSample) {
  1867. switch(srcFormat) {
  1868. case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
  1869. case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
  1870. default: break;
  1871. }
  1872. }
  1873. switch (srcFormat) {
  1874. #if !COMPILE_TEMPLATE_MMX2
  1875. case PIX_FMT_YUYV422 :
  1876. case PIX_FMT_Y400A : c->lumToYV12 = RENAME(yuy2ToY); break;
  1877. case PIX_FMT_UYVY422 : c->lumToYV12 = RENAME(uyvyToY); break;
  1878. #endif /* !COMPILE_TEMPLATE_MMX2 */
  1879. case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
  1880. case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
  1881. default: break;
  1882. }
  1883. #if !COMPILE_TEMPLATE_MMX2
  1884. if (c->alpPixBuf) {
  1885. switch (srcFormat) {
  1886. case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
  1887. default: break;
  1888. }
  1889. }
  1890. #endif /* !COMPILE_TEMPLATE_MMX2 */
  1891. }