/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PREFETCH

#if COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
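
/*
 * On MMX2, MOVNTQ expands to a non-temporal store (movntq): each output
 * quadword is written once and never read back, so bypassing the cache
 * avoids evicting data that is still useful. On plain MMX it degrades to
 * an ordinary movq. PREFETCH likewise becomes prefetchnta on MMX2 and a
 * commented-out nop otherwise.
 */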
#define YSCALEYUV2YV12X(offset, dest, end, pos) \
    __asm__ volatile(\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ".p2align 4 \n\t" /* FIXME Unroll? */\
        "1: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
        "movq 8(%%"REG_S", %3, 2), %%mm5 \n\t" /* srcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        " jnz 1b \n\t"\
        "psraw $3, %%mm3 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm3 \n\t"\
        MOVNTQ(%%mm3, (%1, %3))\
        "add $8, %3 \n\t"\
        "cmp %2, %3 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "r" (&c->redDither),\
           "r" (dest), "g" ((x86_reg)(end)), "r" ((x86_reg)(pos))\
        : "%"REG_d, "%"REG_S\
    );
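
/*
 * A rough C sketch of what YSCALEYUV2YV12X computes (names are
 * illustrative; the real filter is a NULL-terminated list of
 * (source pointer, coefficient) pairs built elsewhere in swscale, and
 * the accumulator starts from the VROUNDER_OFFSET rounding constant):
 *
 *     for (i = pos; i < end; i++) {
 *         int val = rounder;
 *         for (j = 0; filt[j].src; j++)
 *             val += (filt[j].src[i] * filt[j].coeff) >> 16; // pmulhw
 *         dest[i] = av_clip_uint8(val >> 3);        // psraw $3 + packuswb
 *     }
 */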

static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
                             const int16_t **lumSrc, int lumFilterSize,
                             const int16_t *chrFilter, const int16_t **chrUSrc,
                             const int16_t **chrVSrc,
                             int chrFilterSize, const int16_t **alpSrc,
                             uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                             uint8_t *aDest, int dstW, int chrDstW)
{
    if (uDest) {
        x86_reg uv_off = c->uv_off;
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
    }
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
    }
    YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
}

#define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
    __asm__ volatile(\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ".p2align 4 \n\t"\
        "1: \n\t"\
        "movq (%%"REG_S", %3, 2), %%mm0 \n\t" /* srcData */\
        "movq 8(%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %3, 2), %%mm1 \n\t" /* srcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "movq 8(%%"REG_S", %3, 2), %%mm3 \n\t" /* srcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 1b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "psraw $3, %%mm6 \n\t"\
        "packuswb %%mm6, %%mm4 \n\t"\
        MOVNTQ(%%mm4, (%1, %3))\
        "add $8, %3 \n\t"\
        "cmp %2, %3 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "r" (&c->redDither),\
           "r" (dest), "g" ((x86_reg)(end)), "r" ((x86_reg)(pos))\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
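
/*
 * The _ACCURATE variant consumes the filter two taps at a time
 * (APCK_PTR2/APCK_COEF/APCK_SIZE describe that packed layout) and uses
 * pmaddwd, so the products are accumulated in 32-bit precision in
 * mm4..mm7 and only narrowed back to 16 bits after the >>16. That avoids
 * the intermediate 16-bit rounding loss of the plain pmulhw loop above,
 * at the cost of more unpacking work.
 */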

static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                                uint8_t *aDest, int dstW, int chrDstW)
{
    if (uDest) {
        x86_reg uv_off = c->uv_off;
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
    }
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
    }
    YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
}

static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
                             const int16_t *chrUSrc, const int16_t *chrVSrc,
                             const int16_t *alpSrc,
                             uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                             uint8_t *aDest, int dstW, int chrDstW)
{
    int p = 4;
    const int16_t *src[4] = { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
    uint8_t *dst[4] = { aDest, dest, uDest, vDest };
    x86_reg counter[4] = { dstW, dstW, chrDstW, chrDstW };

    while (p--) {
        if (dst[p]) {
            __asm__ volatile(
                "mov %2, %%"REG_a" \n\t"
                ".p2align 4 \n\t" /* FIXME Unroll? */
                "1: \n\t"
                "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
                "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
                "psraw $7, %%mm0 \n\t"
                "psraw $7, %%mm1 \n\t"
                "packuswb %%mm1, %%mm0 \n\t"
                MOVNTQ(%%mm0, (%1, %%REGa))
                "add $8, %%"REG_a" \n\t"
                "jnc 1b \n\t"
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }
}
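
/*
 * Rough C equivalent of the unscaled plane store above: the intermediate
 * samples carry 7 fractional bits, so an arithmetic >>7 with unsigned
 * saturation yields the 8-bit plane. The asm runs the loop with a
 * negative counter that counts up toward zero ("jnc 1b"), which is why
 * src[p] and dst[p] are pre-offset by counter[p]:
 *
 *     for (i = 0; i < width; i++)
 *         dst[i] = av_clip_uint8(src[i] >> 7);    // psraw $7 + packuswb
 */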

static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
                                const int16_t *chrUSrc, const int16_t *chrVSrc,
                                const int16_t *alpSrc,
                                uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                                uint8_t *aDest, int dstW, int chrDstW)
{
    int p = 4;
    const int16_t *src[4] = { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
    uint8_t *dst[4] = { aDest, dest, uDest, vDest };
    x86_reg counter[4] = { dstW, dstW, chrDstW, chrDstW };

    while (p--) {
        if (dst[p]) {
            __asm__ volatile(
                "mov %2, %%"REG_a" \n\t"
                "pcmpeqw %%mm7, %%mm7 \n\t"
                "psrlw $15, %%mm7 \n\t"
                "psllw $6, %%mm7 \n\t"
                ".p2align 4 \n\t" /* FIXME Unroll? */
                "1: \n\t"
                "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
                "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
                "paddsw %%mm7, %%mm0 \n\t"
                "paddsw %%mm7, %%mm1 \n\t"
                "psraw $7, %%mm0 \n\t"
                "psraw $7, %%mm1 \n\t"
                "packuswb %%mm1, %%mm0 \n\t"
                MOVNTQ(%%mm0, (%1, %%REGa))
                "add $8, %%"REG_a" \n\t"
                "jnc 1b \n\t"
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }
}
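
/*
 * The pcmpeqw / psrlw $15 / psllw $6 prologue above builds the constant
 * 0x0040 in every word of %%mm7 without a memory load: all-ones >>15
 * leaves 1 in each word, and <<6 turns it into 64. Adding 64 before the
 * >>7 converts the truncating shift of the plain variant into
 * round-to-nearest, which is the only difference of this "_ar"
 * (accurate rounding) path.
 */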

#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
        "lea "offset"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
        "movq "#dst1", "#dst2" \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
        "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw "#coeff", "#src1" \n\t"\
        "pmulhw "#coeff", "#src2" \n\t"\
        "paddw "#src1", "#dst1" \n\t"\
        "paddw "#src2", "#dst2" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
        :: "r" (&c->redDither), \
           "m" (dummy), "m" (dummy), "m" (dummy),\
           "r" (dest), "m" (dstW_reg), "m" (uv_off) \
        : "%"REG_a, "%"REG_d, "%"REG_S \
    );

#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "movq %%mm4, "U_TEMP"(%0) \n\t"\
        "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
        "lea "offset"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm1, %%mm1 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
        "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm4, %%mm0 \n\t"\
        "punpckhwd %%mm4, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
        "pmaddwd %%mm4, %%mm0 \n\t"\
        "pmaddwd %%mm4, %%mm3 \n\t"\
        "paddd %%mm0, %%mm1 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm4, %%mm2 \n\t"\
        "pmaddwd %%mm4, %%mm0 \n\t"\
        "paddd %%mm2, %%mm7 \n\t"\
        "paddd %%mm0, %%mm6 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm1 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm1 \n\t"\
        "packssdw %%mm6, %%mm7 \n\t"\
        "paddw %%mm0, %%mm1 \n\t"\
        "paddw %%mm0, %%mm7 \n\t"\
        "movq "U_TEMP"(%0), %%mm3 \n\t"\
        "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\


#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)

static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "movq %%mm2, "U_TEMP"(%0) \n\t"
        "movq %%mm4, "V_TEMP"(%0) \n\t"
        "movq %%mm5, "Y_TEMP"(%0) \n\t"
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
        "movq "Y_TEMP"(%0), %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
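
/*
 * Per-pixel RGB565 packing performed by the mask/shift/por sequence
 * above, in C (b, g, r already clipped to 8 bits):
 *
 *     uint16_t px = ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3);
 *
 * The MMX version keeps 8 pixels in flight: bF8/bFC drop the bits that
 * do not survive the 5-6-5 truncation, psrlq/psllq move the fields into
 * place, and the punpck*bw/por steps merge the channels into two
 * quadwords of four 16-bit pixels each.
 */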

static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)

static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
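
/*
 * Two strategies for the awkward 3-bytes-per-pixel store: the plain MMX
 * version shifts and ORs the 0RGB dwords produced by WRITEBGR24MMX into
 * three packed quadwords, while the MMX2 version shuffles channel bytes
 * directly into their 24-bit slots with pshufw plus the ff_M24A/B/C
 * masks. Either way, each 8-pixel iteration emits exactly 24 bytes,
 * i.e. the equivalent of:
 *
 *     for (i = 0; i < 8; i++) {
 *         dst[3 * i + 0] = b[i];
 *         dst[3 * i + 1] = g[i];
 *         dst[3 * i + 2] = r[i];
 *     }
 */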

static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m" (uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m" (uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
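
/*
 * YUY2 stores one U and one V sample per pair of luma samples, byte
 * order Y0 U0 Y1 V0 per macropixel. The packuswb/punpcklbw chain above
 * builds that interleave in registers; per 8 luma samples it is the
 * equivalent of:
 *
 *     for (i = 0; i < 4; i++) {
 *         dst[4 * i + 0] = y[2 * i];
 *         dst[4 * i + 1] = u[i];
 *         dst[4 * i + 2] = y[2 * i + 1];
 *         dst[4 * i + 3] = v[i];
 *     }
 */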

static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                     const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrUSrc,
                                     const int16_t **chrVSrc,
                                     int chrFilterSize, const int16_t **alpSrc,
                                     uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX_ACCURATE
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
                                  const int16_t **lumSrc, int lumFilterSize,
                                  const int16_t *chrFilter, const int16_t **chrUSrc,
                                  const int16_t **chrVSrc,
                                  int chrFilterSize, const int16_t **alpSrc,
                                  uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
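
/*
 * The _UV and _YA halves above perform the 2-tap vertical blend in
 * fixed point, roughly:
 *
 *     out = (buf1 >> 4) + (((buf0 - buf1) * alpha) >> 16);  // pmulhw
 *
 * where alpha is the filter weight stored at *_MMX_FILTER_OFFSET+8.
 * _COEFF then applies the same YUV->RGB matrix as YSCALEYUV2RGBX to the
 * blended values.
 */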

/**
 * vertical bilinear scale YV12 to RGB
 */
static void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0,
                                const uint16_t *buf1, const uint16_t *ubuf0,
                                const uint16_t *ubuf1, const uint16_t *vbuf0,
                                const uint16_t *vbuf1, const uint16_t *abuf0,
                                const uint16_t *abuf1, uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
        __asm__ volatile(
            YSCALEYUV2RGB(%%r8, %5)
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
               "a" (&c->redDither),
               "r" (abuf0), "r" (abuf1)
            : "%r8"
        );
#else
        *(const uint16_t **)(&c->u_temp) = abuf0;
        *(const uint16_t **)(&c->v_temp) = abuf1;
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "push %0 \n\t"
            "push %1 \n\t"
            "mov "U_TEMP"(%5), %0 \n\t"
            "mov "V_TEMP"(%5), %1 \n\t"
            YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            "pop %1 \n\t"
            "pop %0 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
#endif
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}

static void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0,
                                const uint16_t *buf1, const uint16_t *ubuf0,
                                const uint16_t *ubuf1, const uint16_t *vbuf0,
                                const uint16_t *vbuf1, const uint16_t *abuf0,
                                const uint16_t *abuf1, uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

static void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0,
                                 const uint16_t *buf1, const uint16_t *ubuf0,
                                 const uint16_t *ubuf1, const uint16_t *vbuf0,
                                 const uint16_t *vbuf1, const uint16_t *abuf0,
                                 const uint16_t *abuf1, uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB15(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

static void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0,
                                 const uint16_t *buf1, const uint16_t *ubuf0,
                                 const uint16_t *ubuf1, const uint16_t *vbuf0,
                                 const uint16_t *vbuf1, const uint16_t *abuf0,
                                 const uint16_t *abuf1, uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB16(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}

#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  1075. "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  1076. "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  1077. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  1078. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  1079. "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
  1080. "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
  1081. "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
  1082. "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
  1083. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  1084. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  1085. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  1086. "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  1087. "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1088. "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1089. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  1090. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  1091. #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
  1092. static void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0,
  1093. const uint16_t *buf1, const uint16_t *ubuf0,
  1094. const uint16_t *ubuf1, const uint16_t *vbuf0,
  1095. const uint16_t *vbuf1, const uint16_t *abuf0,
  1096. const uint16_t *abuf1, uint8_t *dest,
  1097. int dstW, int yalpha, int uvalpha, int y)
  1098. {
  1099. //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
  1100. __asm__ volatile(
  1101. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1102. "mov %4, %%"REG_b" \n\t"
  1103. "push %%"REG_BP" \n\t"
  1104. YSCALEYUV2PACKED(%%REGBP, %5)
  1105. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1106. "pop %%"REG_BP" \n\t"
  1107. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1108. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1109. "a" (&c->redDither)
  1110. );
  1111. }

#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  1120. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  1121. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  1122. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  1123. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  1124. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  1125. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  1126. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  1127. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  1128. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  1129. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1130. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  1131. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1132. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1133. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  1134. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  1135. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  1136. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  1137. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  1138. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  1139. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  1140. "paddw %%mm3, %%mm4 \n\t"\
  1141. "movq %%mm2, %%mm0 \n\t"\
  1142. "movq %%mm5, %%mm6 \n\t"\
  1143. "movq %%mm4, %%mm3 \n\t"\
  1144. "punpcklwd %%mm2, %%mm2 \n\t"\
  1145. "punpcklwd %%mm5, %%mm5 \n\t"\
  1146. "punpcklwd %%mm4, %%mm4 \n\t"\
  1147. "paddw %%mm1, %%mm2 \n\t"\
  1148. "paddw %%mm1, %%mm5 \n\t"\
  1149. "paddw %%mm1, %%mm4 \n\t"\
  1150. "punpckhwd %%mm0, %%mm0 \n\t"\
  1151. "punpckhwd %%mm6, %%mm6 \n\t"\
  1152. "punpckhwd %%mm3, %%mm3 \n\t"\
  1153. "paddw %%mm7, %%mm0 \n\t"\
  1154. "paddw %%mm7, %%mm6 \n\t"\
  1155. "paddw %%mm7, %%mm3 \n\t"\
  1156. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  1157. "packuswb %%mm0, %%mm2 \n\t"\
  1158. "packuswb %%mm6, %%mm5 \n\t"\
  1159. "packuswb %%mm3, %%mm4 \n\t"\
  1160. #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
  1161. // do vertical chrominance interpolation
  1162. #define REAL_YSCALEYUV2RGB1b(index, c) \
  1163. "xor "#index", "#index" \n\t"\
  1164. ".p2align 4 \n\t"\
  1165. "1: \n\t"\
  1166. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  1167. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  1168. "add "UV_OFFx2"("#c"), "#index" \n\t" \
  1169. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  1170. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  1171. "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  1172. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  1173. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  1174. "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
  1175. "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
  1176. "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
  1177. "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
  1178. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  1179. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  1180. "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
  1181. "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
  1182. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  1183. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1184. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  1185. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1186. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  1187. "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
  1188. "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
  1189. "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
  1190. "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
  1191. "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
  1192. "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
  1193. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  1194. "paddw %%mm3, %%mm4 \n\t"\
  1195. "movq %%mm2, %%mm0 \n\t"\
  1196. "movq %%mm5, %%mm6 \n\t"\
  1197. "movq %%mm4, %%mm3 \n\t"\
  1198. "punpcklwd %%mm2, %%mm2 \n\t"\
  1199. "punpcklwd %%mm5, %%mm5 \n\t"\
  1200. "punpcklwd %%mm4, %%mm4 \n\t"\
  1201. "paddw %%mm1, %%mm2 \n\t"\
  1202. "paddw %%mm1, %%mm5 \n\t"\
  1203. "paddw %%mm1, %%mm4 \n\t"\
  1204. "punpckhwd %%mm0, %%mm0 \n\t"\
  1205. "punpckhwd %%mm6, %%mm6 \n\t"\
  1206. "punpckhwd %%mm3, %%mm3 \n\t"\
  1207. "paddw %%mm7, %%mm0 \n\t"\
  1208. "paddw %%mm7, %%mm6 \n\t"\
  1209. "paddw %%mm7, %%mm3 \n\t"\
  1210. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  1211. "packuswb %%mm0, %%mm2 \n\t"\
  1212. "packuswb %%mm6, %%mm5 \n\t"\
  1213. "packuswb %%mm3, %%mm4 \n\t"\
  1214. #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
  1215. #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
  1216. "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
  1217. "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
  1218. "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
  1219. "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
  1220. "packuswb %%mm1, %%mm7 \n\t"
  1221. #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)

/**
 * YV12 to RGB without scaling or interpolating
 */
static void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0,
                                const uint16_t *ubuf0, const uint16_t *ubuf1,
                                const uint16_t *vbuf0, const uint16_t *vbuf1,
                                const uint16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, enum PixelFormat dstFormat,
                                int flags, int y)
{
    const uint16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    } else {
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    }
}

static void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0,
                                const uint16_t *ubuf0, const uint16_t *ubuf1,
                                const uint16_t *vbuf0, const uint16_t *vbuf1,
                                const uint16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, enum PixelFormat dstFormat,
                                int flags, int y)
{
    const uint16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}

static void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0,
                                 const uint16_t *ubuf0, const uint16_t *ubuf1,
                                 const uint16_t *vbuf0, const uint16_t *vbuf1,
                                 const uint16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, enum PixelFormat dstFormat,
                                 int flags, int y)
{
    const uint16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
  1375. static void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
  1376. const uint16_t *ubuf0, const uint16_t *ubuf1,
  1377. const uint16_t *vbuf0, const uint16_t *vbuf1,
  1378. const uint16_t *abuf0, uint8_t *dest,
  1379. int dstW, int uvalpha, enum PixelFormat dstFormat,
  1380. int flags, int y)
  1381. {
  1382. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1383. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1384. __asm__ volatile(
  1385. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1386. "mov %4, %%"REG_b" \n\t"
  1387. "push %%"REG_BP" \n\t"
  1388. YSCALEYUV2RGB1(%%REGBP, %5)
  1389. "pxor %%mm7, %%mm7 \n\t"
1390. /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1391. #ifdef DITHER1XBPP
  1392. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1393. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1394. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1395. #endif
  1396. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1397. "pop %%"REG_BP" \n\t"
  1398. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1399. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1400. "a" (&c->redDither)
  1401. );
  1402. } else {
  1403. __asm__ volatile(
  1404. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1405. "mov %4, %%"REG_b" \n\t"
  1406. "push %%"REG_BP" \n\t"
  1407. YSCALEYUV2RGB1b(%%REGBP, %5)
  1408. "pxor %%mm7, %%mm7 \n\t"
1409. /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1410. #ifdef DITHER1XBPP
  1411. "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
  1412. "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
  1413. "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
  1414. #endif
  1415. WRITERGB16(%%REGb, 8280(%5), %%REGBP)
  1416. "pop %%"REG_BP" \n\t"
  1417. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1418. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1419. "a" (&c->redDither)
  1420. );
  1421. }
  1422. }
  1423. #define REAL_YSCALEYUV2PACKED1(index, c) \
  1424. "xor "#index", "#index" \n\t"\
  1425. ".p2align 4 \n\t"\
  1426. "1: \n\t"\
  1427. "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
  1428. "add "UV_OFFx2"("#c"), "#index" \n\t" \
  1429. "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  1430. "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  1431. "psraw $7, %%mm3 \n\t" \
  1432. "psraw $7, %%mm4 \n\t" \
  1433. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1434. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  1435. "psraw $7, %%mm1 \n\t" \
  1436. "psraw $7, %%mm7 \n\t" \
  1437. #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
  1438. #define REAL_YSCALEYUV2PACKED1b(index, c) \
  1439. "xor "#index", "#index" \n\t"\
  1440. ".p2align 4 \n\t"\
  1441. "1: \n\t"\
  1442. "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
  1443. "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
  1444. "add "UV_OFFx2"("#c"), "#index" \n\t" \
  1445. "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  1446. "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  1447. "sub "UV_OFFx2"("#c"), "#index" \n\t" \
  1448. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  1449. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  1450. "psrlw $8, %%mm3 \n\t" \
  1451. "psrlw $8, %%mm4 \n\t" \
  1452. "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
  1453. "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
  1454. "psraw $7, %%mm1 \n\t" \
  1455. "psraw $7, %%mm7 \n\t"
  1456. #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
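/* The two variants above differ only in how chroma is fetched:
 * YSCALEYUV2PACKED1 reads the single line ubuf0 and scales it with
 * psraw $7, while the 1b variant averages ubuf0 and ubuf1 first (paddw,
 * then psrlw $8, i.e. (a + b) >> 8 == ((a + b) / 2) >> 7). In scalar terms
 * (sketch only):
 *
 *     uv_fast     =  ubuf0[i] >> 7;
 *     uv_averaged = (ubuf0[i] + ubuf1[i]) >> 8;
 */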
  1457. static void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0,
  1458. const uint16_t *ubuf0, const uint16_t *ubuf1,
  1459. const uint16_t *vbuf0, const uint16_t *vbuf1,
  1460. const uint16_t *abuf0, uint8_t *dest,
  1461. int dstW, int uvalpha, enum PixelFormat dstFormat,
  1462. int flags, int y)
  1463. {
  1464. const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
  1465. if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
  1466. __asm__ volatile(
  1467. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1468. "mov %4, %%"REG_b" \n\t"
  1469. "push %%"REG_BP" \n\t"
  1470. YSCALEYUV2PACKED1(%%REGBP, %5)
  1471. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1472. "pop %%"REG_BP" \n\t"
  1473. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1474. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1475. "a" (&c->redDither)
  1476. );
  1477. } else {
  1478. __asm__ volatile(
  1479. "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
  1480. "mov %4, %%"REG_b" \n\t"
  1481. "push %%"REG_BP" \n\t"
  1482. YSCALEYUV2PACKED1b(%%REGBP, %5)
  1483. WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
  1484. "pop %%"REG_BP" \n\t"
  1485. "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
  1486. :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
  1487. "a" (&c->redDither)
  1488. );
  1489. }
  1490. }
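/* WRITEYUY2 interleaves the scaled planes into packed YUYV; per pair of
 * output pixels this amounts to (sketch only):
 *
 *     dest[4*i + 0] = y0;  dest[4*i + 1] = u;
 *     dest[4*i + 2] = y1;  dest[4*i + 3] = v;
 */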
  1491. #if !COMPILE_TEMPLATE_MMX2
1492. //FIXME yuy2* can read up to 7 samples too many
  1493. static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src,
  1494. int width, uint32_t *unused)
  1495. {
  1496. __asm__ volatile(
  1497. "movq "MANGLE(bm01010101)", %%mm2 \n\t"
  1498. "mov %0, %%"REG_a" \n\t"
  1499. "1: \n\t"
  1500. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1501. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1502. "pand %%mm2, %%mm0 \n\t"
  1503. "pand %%mm2, %%mm1 \n\t"
  1504. "packuswb %%mm1, %%mm0 \n\t"
  1505. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1506. "add $8, %%"REG_a" \n\t"
  1507. " js 1b \n\t"
  1508. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1509. : "%"REG_a
  1510. );
  1511. }
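/* Scalar equivalent of the loop above (the bm01010101 mask keeps the even
 * bytes of YUYV, i.e. the luma samples):
 *
 *     for (i = 0; i < width; i++)
 *         dst[i] = src[2 * i];
 */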
  1512. static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
  1513. const uint8_t *src1, const uint8_t *src2,
  1514. int width, uint32_t *unused)
  1515. {
  1516. __asm__ volatile(
  1517. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1518. "mov %0, %%"REG_a" \n\t"
  1519. "1: \n\t"
  1520. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1521. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1522. "psrlw $8, %%mm0 \n\t"
  1523. "psrlw $8, %%mm1 \n\t"
  1524. "packuswb %%mm1, %%mm0 \n\t"
  1525. "movq %%mm0, %%mm1 \n\t"
  1526. "psrlw $8, %%mm0 \n\t"
  1527. "pand %%mm4, %%mm1 \n\t"
  1528. "packuswb %%mm0, %%mm0 \n\t"
  1529. "packuswb %%mm1, %%mm1 \n\t"
  1530. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1531. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1532. "add $4, %%"REG_a" \n\t"
  1533. " js 1b \n\t"
  1534. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1535. : "%"REG_a
  1536. );
  1537. assert(src1 == src2);
  1538. }
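/* In scalar terms the function above extracts the odd (chroma) bytes of
 * YUYV and splits them into the two planes (sketch only):
 *
 *     for (i = 0; i < width; i++) {
 *         dstU[i] = src1[4 * i + 1];
 *         dstV[i] = src1[4 * i + 3];
 *     }
 */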
  1539. static void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV,
  1540. const uint8_t *src1, const uint8_t *src2,
  1541. int width, uint32_t *unused)
  1542. {
  1543. __asm__ volatile(
  1544. "mov %0, %%"REG_a" \n\t"
  1545. "1: \n\t"
  1546. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1547. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1548. "movq (%2, %%"REG_a",2), %%mm2 \n\t"
  1549. "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
  1550. "psrlw $8, %%mm0 \n\t"
  1551. "psrlw $8, %%mm1 \n\t"
  1552. "psrlw $8, %%mm2 \n\t"
  1553. "psrlw $8, %%mm3 \n\t"
  1554. "packuswb %%mm1, %%mm0 \n\t"
  1555. "packuswb %%mm3, %%mm2 \n\t"
  1556. "movq %%mm0, (%3, %%"REG_a") \n\t"
  1557. "movq %%mm2, (%4, %%"REG_a") \n\t"
  1558. "add $8, %%"REG_a" \n\t"
  1559. " js 1b \n\t"
  1560. : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
  1561. : "%"REG_a
  1562. );
  1563. }
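/* LEToUV serves the 16-bit little-endian planar inputs: psrlw $8 keeps the
 * most significant byte of each sample, i.e. (sketch only):
 *
 *     dstU[i] = src1[2 * i + 1];
 *     dstV[i] = src2[2 * i + 1];
 */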
1564. /* This is almost identical to the yuy2 functions above, and exists only
1565. * because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
  1566. static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src,
  1567. int width, uint32_t *unused)
  1568. {
  1569. __asm__ volatile(
  1570. "mov %0, %%"REG_a" \n\t"
  1571. "1: \n\t"
  1572. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1573. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1574. "psrlw $8, %%mm0 \n\t"
  1575. "psrlw $8, %%mm1 \n\t"
  1576. "packuswb %%mm1, %%mm0 \n\t"
  1577. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1578. "add $8, %%"REG_a" \n\t"
  1579. " js 1b \n\t"
  1580. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
  1581. : "%"REG_a
  1582. );
  1583. }
  1584. static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
  1585. const uint8_t *src1, const uint8_t *src2,
  1586. int width, uint32_t *unused)
  1587. {
  1588. __asm__ volatile(
  1589. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1590. "mov %0, %%"REG_a" \n\t"
  1591. "1: \n\t"
  1592. "movq (%1, %%"REG_a",4), %%mm0 \n\t"
  1593. "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
  1594. "pand %%mm4, %%mm0 \n\t"
  1595. "pand %%mm4, %%mm1 \n\t"
  1596. "packuswb %%mm1, %%mm0 \n\t"
  1597. "movq %%mm0, %%mm1 \n\t"
  1598. "psrlw $8, %%mm0 \n\t"
  1599. "pand %%mm4, %%mm1 \n\t"
  1600. "packuswb %%mm0, %%mm0 \n\t"
  1601. "packuswb %%mm1, %%mm1 \n\t"
  1602. "movd %%mm0, (%3, %%"REG_a") \n\t"
  1603. "movd %%mm1, (%2, %%"REG_a") \n\t"
  1604. "add $4, %%"REG_a" \n\t"
  1605. " js 1b \n\t"
  1606. : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
  1607. : "%"REG_a
  1608. );
  1609. assert(src1 == src2);
  1610. }
  1611. static void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV,
  1612. const uint8_t *src1, const uint8_t *src2,
  1613. int width, uint32_t *unused)
  1614. {
  1615. __asm__ volatile(
  1616. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1617. "mov %0, %%"REG_a" \n\t"
  1618. "1: \n\t"
  1619. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1620. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1621. "movq (%2, %%"REG_a",2), %%mm2 \n\t"
  1622. "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
  1623. "pand %%mm4, %%mm0 \n\t"
  1624. "pand %%mm4, %%mm1 \n\t"
  1625. "pand %%mm4, %%mm2 \n\t"
  1626. "pand %%mm4, %%mm3 \n\t"
  1627. "packuswb %%mm1, %%mm0 \n\t"
  1628. "packuswb %%mm3, %%mm2 \n\t"
  1629. "movq %%mm0, (%3, %%"REG_a") \n\t"
  1630. "movq %%mm2, (%4, %%"REG_a") \n\t"
  1631. "add $8, %%"REG_a" \n\t"
  1632. " js 1b \n\t"
  1633. : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
  1634. : "%"REG_a
  1635. );
  1636. }
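/* BEToUV is the big-endian counterpart of LEToUV: the pand with
 * bm01010101 keeps the low byte of each word, which for big-endian data is
 * the most significant byte of the sample (sketch only):
 *
 *     dstU[i] = src1[2 * i];
 *     dstV[i] = src2[2 * i];
 */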
  1637. static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
  1638. const uint8_t *src, int width)
  1639. {
  1640. __asm__ volatile(
  1641. "movq "MANGLE(bm01010101)", %%mm4 \n\t"
  1642. "mov %0, %%"REG_a" \n\t"
  1643. "1: \n\t"
  1644. "movq (%1, %%"REG_a",2), %%mm0 \n\t"
  1645. "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
  1646. "movq %%mm0, %%mm2 \n\t"
  1647. "movq %%mm1, %%mm3 \n\t"
  1648. "pand %%mm4, %%mm0 \n\t"
  1649. "pand %%mm4, %%mm1 \n\t"
  1650. "psrlw $8, %%mm2 \n\t"
  1651. "psrlw $8, %%mm3 \n\t"
  1652. "packuswb %%mm1, %%mm0 \n\t"
  1653. "packuswb %%mm3, %%mm2 \n\t"
  1654. "movq %%mm0, (%2, %%"REG_a") \n\t"
  1655. "movq %%mm2, (%3, %%"REG_a") \n\t"
  1656. "add $8, %%"REG_a" \n\t"
  1657. " js 1b \n\t"
  1658. : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
  1659. : "%"REG_a
  1660. );
  1661. }
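/* nvXXtoUV deinterleaves a packed chroma plane: the masked even bytes go to
 * dst1, the shifted-down odd bytes to dst2. The nv12ToUV/nv21ToUV wrappers
 * below differ only in which destination receives U and which V. */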
  1662. static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
  1663. const uint8_t *src1, const uint8_t *src2,
  1664. int width, uint32_t *unused)
  1665. {
  1666. RENAME(nvXXtoUV)(dstU, dstV, src1, width);
  1667. }
  1668. static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
  1669. const uint8_t *src1, const uint8_t *src2,
  1670. int width, uint32_t *unused)
  1671. {
  1672. RENAME(nvXXtoUV)(dstV, dstU, src1, width);
  1673. }
  1674. #endif /* !COMPILE_TEMPLATE_MMX2 */
  1675. static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src,
  1676. int width, enum PixelFormat srcFormat)
  1677. {
  1678. if(srcFormat == PIX_FMT_BGR24) {
  1679. __asm__ volatile(
  1680. "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
  1681. "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
  1682. :
  1683. );
  1684. } else {
  1685. __asm__ volatile(
  1686. "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
  1687. "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
  1688. :
  1689. );
  1690. }
  1691. __asm__ volatile(
  1692. "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
  1693. "mov %2, %%"REG_a" \n\t"
  1694. "pxor %%mm7, %%mm7 \n\t"
  1695. "1: \n\t"
  1696. PREFETCH" 64(%0) \n\t"
  1697. "movd (%0), %%mm0 \n\t"
  1698. "movd 2(%0), %%mm1 \n\t"
  1699. "movd 6(%0), %%mm2 \n\t"
  1700. "movd 8(%0), %%mm3 \n\t"
  1701. "add $12, %0 \n\t"
  1702. "punpcklbw %%mm7, %%mm0 \n\t"
  1703. "punpcklbw %%mm7, %%mm1 \n\t"
  1704. "punpcklbw %%mm7, %%mm2 \n\t"
  1705. "punpcklbw %%mm7, %%mm3 \n\t"
  1706. "pmaddwd %%mm5, %%mm0 \n\t"
  1707. "pmaddwd %%mm6, %%mm1 \n\t"
  1708. "pmaddwd %%mm5, %%mm2 \n\t"
  1709. "pmaddwd %%mm6, %%mm3 \n\t"
  1710. "paddd %%mm1, %%mm0 \n\t"
  1711. "paddd %%mm3, %%mm2 \n\t"
  1712. "paddd %%mm4, %%mm0 \n\t"
  1713. "paddd %%mm4, %%mm2 \n\t"
  1714. "psrad $15, %%mm0 \n\t"
  1715. "psrad $15, %%mm2 \n\t"
  1716. "packssdw %%mm2, %%mm0 \n\t"
  1717. "packuswb %%mm0, %%mm0 \n\t"
  1718. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1719. "add $4, %%"REG_a" \n\t"
  1720. " js 1b \n\t"
  1721. : "+r" (src)
  1722. : "r" (dst+width), "g" ((x86_reg)-width)
  1723. : "%"REG_a
  1724. );
  1725. }
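/* The pmaddwd/paddd sequence above evaluates the usual RGB->Y dot product
 * in 15-bit fixed point; per pixel this is roughly (sketch only, the
 * coefficient and offset names are illustrative):
 *
 *     dst[i] = (cb*b + cg*g + cr*r + y_offset) >> 15;
 */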
  1726. static void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src,
  1727. int width, uint32_t *unused)
  1728. {
  1729. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
  1730. }
  1731. static void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src,
  1732. int width, uint32_t *unused)
  1733. {
  1734. RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
  1735. }
  1736. static av_always_inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV,
  1737. const uint8_t *src, int width,
  1738. enum PixelFormat srcFormat)
  1739. {
  1740. __asm__ volatile(
  1741. "movq 24(%4), %%mm6 \n\t"
  1742. "mov %3, %%"REG_a" \n\t"
  1743. "pxor %%mm7, %%mm7 \n\t"
  1744. "1: \n\t"
  1745. PREFETCH" 64(%0) \n\t"
  1746. "movd (%0), %%mm0 \n\t"
  1747. "movd 2(%0), %%mm1 \n\t"
  1748. "punpcklbw %%mm7, %%mm0 \n\t"
  1749. "punpcklbw %%mm7, %%mm1 \n\t"
  1750. "movq %%mm0, %%mm2 \n\t"
  1751. "movq %%mm1, %%mm3 \n\t"
  1752. "pmaddwd (%4), %%mm0 \n\t"
  1753. "pmaddwd 8(%4), %%mm1 \n\t"
  1754. "pmaddwd 16(%4), %%mm2 \n\t"
  1755. "pmaddwd %%mm6, %%mm3 \n\t"
  1756. "paddd %%mm1, %%mm0 \n\t"
  1757. "paddd %%mm3, %%mm2 \n\t"
  1758. "movd 6(%0), %%mm1 \n\t"
  1759. "movd 8(%0), %%mm3 \n\t"
  1760. "add $12, %0 \n\t"
  1761. "punpcklbw %%mm7, %%mm1 \n\t"
  1762. "punpcklbw %%mm7, %%mm3 \n\t"
  1763. "movq %%mm1, %%mm4 \n\t"
  1764. "movq %%mm3, %%mm5 \n\t"
  1765. "pmaddwd (%4), %%mm1 \n\t"
  1766. "pmaddwd 8(%4), %%mm3 \n\t"
  1767. "pmaddwd 16(%4), %%mm4 \n\t"
  1768. "pmaddwd %%mm6, %%mm5 \n\t"
  1769. "paddd %%mm3, %%mm1 \n\t"
  1770. "paddd %%mm5, %%mm4 \n\t"
  1771. "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
  1772. "paddd %%mm3, %%mm0 \n\t"
  1773. "paddd %%mm3, %%mm2 \n\t"
  1774. "paddd %%mm3, %%mm1 \n\t"
  1775. "paddd %%mm3, %%mm4 \n\t"
  1776. "psrad $15, %%mm0 \n\t"
  1777. "psrad $15, %%mm2 \n\t"
  1778. "psrad $15, %%mm1 \n\t"
  1779. "psrad $15, %%mm4 \n\t"
  1780. "packssdw %%mm1, %%mm0 \n\t"
  1781. "packssdw %%mm4, %%mm2 \n\t"
  1782. "packuswb %%mm0, %%mm0 \n\t"
  1783. "packuswb %%mm2, %%mm2 \n\t"
  1784. "movd %%mm0, (%1, %%"REG_a") \n\t"
  1785. "movd %%mm2, (%2, %%"REG_a") \n\t"
  1786. "add $4, %%"REG_a" \n\t"
  1787. " js 1b \n\t"
  1788. : "+r" (src)
  1789. : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
  1790. : "%"REG_a
  1791. );
  1792. }
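/* bgr24ToUV_mmx follows the same pattern, but computes two dot products
 * per pixel against the coefficient table picked via
 * ff_bgr24toUV[srcFormat == PIX_FMT_RGB24]; roughly (sketch only, names
 * illustrative):
 *
 *     dstU[i] = (ub*b + ug*g + ur*r + uv_offset) >> 15;
 *     dstV[i] = (vb*b + vg*g + vr*r + uv_offset) >> 15;
 */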
  1793. static void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV,
  1794. const uint8_t *src1, const uint8_t *src2,
  1795. int width, uint32_t *unused)
  1796. {
  1797. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
  1798. assert(src1 == src2);
  1799. }
  1800. static void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV,
  1801. const uint8_t *src1, const uint8_t *src2,
  1802. int width, uint32_t *unused)
  1803. {
  1804. assert(src1==src2);
  1805. RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
  1806. }
  1807. #if !COMPILE_TEMPLATE_MMX2
  1808. // bilinear / bicubic scaling
  1809. static void RENAME(hScale)(int16_t *dst, int dstW,
  1810. const uint8_t *src, int srcW,
  1811. int xInc, const int16_t *filter,
  1812. const int16_t *filterPos, int filterSize)
  1813. {
  1814. assert(filterSize % 4 == 0 && filterSize>0);
1815. if (filterSize==4) { // Always true for upscaling, sometimes for downscaling, too.
  1816. x86_reg counter= -2*dstW;
  1817. filter-= counter*2;
  1818. filterPos-= counter/2;
  1819. dst-= counter/2;
  1820. __asm__ volatile(
  1821. #if defined(PIC)
  1822. "push %%"REG_b" \n\t"
  1823. #endif
  1824. "pxor %%mm7, %%mm7 \n\t"
  1825. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  1826. "mov %%"REG_a", %%"REG_BP" \n\t"
  1827. ".p2align 4 \n\t"
  1828. "1: \n\t"
  1829. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  1830. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  1831. "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
  1832. "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
  1833. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  1834. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  1835. "punpcklbw %%mm7, %%mm0 \n\t"
  1836. "punpcklbw %%mm7, %%mm2 \n\t"
  1837. "pmaddwd %%mm1, %%mm0 \n\t"
  1838. "pmaddwd %%mm2, %%mm3 \n\t"
  1839. "movq %%mm0, %%mm4 \n\t"
  1840. "punpckldq %%mm3, %%mm0 \n\t"
  1841. "punpckhdq %%mm3, %%mm4 \n\t"
  1842. "paddd %%mm4, %%mm0 \n\t"
  1843. "psrad $7, %%mm0 \n\t"
  1844. "packssdw %%mm0, %%mm0 \n\t"
  1845. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  1846. "add $4, %%"REG_BP" \n\t"
  1847. " jnc 1b \n\t"
  1848. "pop %%"REG_BP" \n\t"
  1849. #if defined(PIC)
  1850. "pop %%"REG_b" \n\t"
  1851. #endif
  1852. : "+a" (counter)
  1853. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  1854. #if !defined(PIC)
  1855. : "%"REG_b
  1856. #endif
  1857. );
  1858. } else if (filterSize==8) {
  1859. x86_reg counter= -2*dstW;
  1860. filter-= counter*4;
  1861. filterPos-= counter/2;
  1862. dst-= counter/2;
  1863. __asm__ volatile(
  1864. #if defined(PIC)
  1865. "push %%"REG_b" \n\t"
  1866. #endif
  1867. "pxor %%mm7, %%mm7 \n\t"
  1868. "push %%"REG_BP" \n\t" // we use 7 regs here ...
  1869. "mov %%"REG_a", %%"REG_BP" \n\t"
  1870. ".p2align 4 \n\t"
  1871. "1: \n\t"
  1872. "movzwl (%2, %%"REG_BP"), %%eax \n\t"
  1873. "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
  1874. "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
  1875. "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
  1876. "movd (%3, %%"REG_a"), %%mm0 \n\t"
  1877. "movd (%3, %%"REG_b"), %%mm2 \n\t"
  1878. "punpcklbw %%mm7, %%mm0 \n\t"
  1879. "punpcklbw %%mm7, %%mm2 \n\t"
  1880. "pmaddwd %%mm1, %%mm0 \n\t"
  1881. "pmaddwd %%mm2, %%mm3 \n\t"
  1882. "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
  1883. "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
  1884. "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
  1885. "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
  1886. "punpcklbw %%mm7, %%mm4 \n\t"
  1887. "punpcklbw %%mm7, %%mm2 \n\t"
  1888. "pmaddwd %%mm1, %%mm4 \n\t"
  1889. "pmaddwd %%mm2, %%mm5 \n\t"
  1890. "paddd %%mm4, %%mm0 \n\t"
  1891. "paddd %%mm5, %%mm3 \n\t"
  1892. "movq %%mm0, %%mm4 \n\t"
  1893. "punpckldq %%mm3, %%mm0 \n\t"
  1894. "punpckhdq %%mm3, %%mm4 \n\t"
  1895. "paddd %%mm4, %%mm0 \n\t"
  1896. "psrad $7, %%mm0 \n\t"
  1897. "packssdw %%mm0, %%mm0 \n\t"
  1898. "movd %%mm0, (%4, %%"REG_BP") \n\t"
  1899. "add $4, %%"REG_BP" \n\t"
  1900. " jnc 1b \n\t"
  1901. "pop %%"REG_BP" \n\t"
  1902. #if defined(PIC)
  1903. "pop %%"REG_b" \n\t"
  1904. #endif
  1905. : "+a" (counter)
  1906. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  1907. #if !defined(PIC)
  1908. : "%"REG_b
  1909. #endif
  1910. );
  1911. } else {
  1912. const uint8_t *offset = src+filterSize;
  1913. x86_reg counter= -2*dstW;
  1914. //filter-= counter*filterSize/2;
  1915. filterPos-= counter/2;
  1916. dst-= counter/2;
  1917. __asm__ volatile(
  1918. "pxor %%mm7, %%mm7 \n\t"
  1919. ".p2align 4 \n\t"
  1920. "1: \n\t"
  1921. "mov %2, %%"REG_c" \n\t"
  1922. "movzwl (%%"REG_c", %0), %%eax \n\t"
  1923. "movzwl 2(%%"REG_c", %0), %%edx \n\t"
  1924. "mov %5, %%"REG_c" \n\t"
  1925. "pxor %%mm4, %%mm4 \n\t"
  1926. "pxor %%mm5, %%mm5 \n\t"
  1927. "2: \n\t"
  1928. "movq (%1), %%mm1 \n\t"
  1929. "movq (%1, %6), %%mm3 \n\t"
  1930. "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
  1931. "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
  1932. "punpcklbw %%mm7, %%mm0 \n\t"
  1933. "punpcklbw %%mm7, %%mm2 \n\t"
  1934. "pmaddwd %%mm1, %%mm0 \n\t"
  1935. "pmaddwd %%mm2, %%mm3 \n\t"
  1936. "paddd %%mm3, %%mm5 \n\t"
  1937. "paddd %%mm0, %%mm4 \n\t"
  1938. "add $8, %1 \n\t"
  1939. "add $4, %%"REG_c" \n\t"
  1940. "cmp %4, %%"REG_c" \n\t"
  1941. " jb 2b \n\t"
  1942. "add %6, %1 \n\t"
  1943. "movq %%mm4, %%mm0 \n\t"
  1944. "punpckldq %%mm5, %%mm4 \n\t"
  1945. "punpckhdq %%mm5, %%mm0 \n\t"
  1946. "paddd %%mm0, %%mm4 \n\t"
  1947. "psrad $7, %%mm4 \n\t"
  1948. "packssdw %%mm4, %%mm4 \n\t"
  1949. "mov %3, %%"REG_a" \n\t"
  1950. "movd %%mm4, (%%"REG_a", %0) \n\t"
  1951. "add $4, %0 \n\t"
  1952. " jnc 1b \n\t"
  1953. : "+r" (counter), "+r" (filter)
  1954. : "m" (filterPos), "m" (dst), "m"(offset),
  1955. "m" (src), "r" ((x86_reg)filterSize*2)
  1956. : "%"REG_a, "%"REG_c, "%"REG_d
  1957. );
  1958. }
  1959. }
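/* All three filterSize paths above compute the same horizontal
 * convolution; a plain-C reference (close to what the generic C hScale
 * does, sketch only):
 *
 *     for (i = 0; i < dstW; i++) {
 *         int j, val = 0;
 *         for (j = 0; j < filterSize; j++)
 *             val += src[filterPos[i] + j] * filter[filterSize * i + j];
 *         dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
 *     }
 */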
  1960. #endif /* !COMPILE_TEMPLATE_MMX2 */
  1961. #if COMPILE_TEMPLATE_MMX2
  1962. static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
  1963. int dstWidth, const uint8_t *src,
  1964. int srcW, int xInc)
  1965. {
  1966. int16_t *filterPos = c->hLumFilterPos;
  1967. int16_t *filter = c->hLumFilter;
  1968. void *mmx2FilterCode= c->lumMmx2FilterCode;
  1969. int i;
  1970. #if defined(PIC)
  1971. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  1972. #endif
  1973. __asm__ volatile(
  1974. #if defined(PIC)
  1975. "mov %%"REG_b", %5 \n\t"
  1976. #endif
  1977. "pxor %%mm7, %%mm7 \n\t"
  1978. "mov %0, %%"REG_c" \n\t"
  1979. "mov %1, %%"REG_D" \n\t"
  1980. "mov %2, %%"REG_d" \n\t"
  1981. "mov %3, %%"REG_b" \n\t"
  1982. "xor %%"REG_a", %%"REG_a" \n\t" // i
  1983. PREFETCH" (%%"REG_c") \n\t"
  1984. PREFETCH" 32(%%"REG_c") \n\t"
  1985. PREFETCH" 64(%%"REG_c") \n\t"
  1986. #if ARCH_X86_64
  1987. #define CALL_MMX2_FILTER_CODE \
  1988. "movl (%%"REG_b"), %%esi \n\t"\
  1989. "call *%4 \n\t"\
  1990. "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
  1991. "add %%"REG_S", %%"REG_c" \n\t"\
  1992. "add %%"REG_a", %%"REG_D" \n\t"\
  1993. "xor %%"REG_a", %%"REG_a" \n\t"\
  1994. #else
  1995. #define CALL_MMX2_FILTER_CODE \
  1996. "movl (%%"REG_b"), %%esi \n\t"\
  1997. "call *%4 \n\t"\
  1998. "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
  1999. "add %%"REG_a", %%"REG_D" \n\t"\
  2000. "xor %%"REG_a", %%"REG_a" \n\t"\
  2001. #endif /* ARCH_X86_64 */
  2002. CALL_MMX2_FILTER_CODE
  2003. CALL_MMX2_FILTER_CODE
  2004. CALL_MMX2_FILTER_CODE
  2005. CALL_MMX2_FILTER_CODE
  2006. CALL_MMX2_FILTER_CODE
  2007. CALL_MMX2_FILTER_CODE
  2008. CALL_MMX2_FILTER_CODE
  2009. CALL_MMX2_FILTER_CODE
  2010. #if defined(PIC)
  2011. "mov %5, %%"REG_b" \n\t"
  2012. #endif
  2013. :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
  2014. "m" (mmx2FilterCode)
  2015. #if defined(PIC)
  2016. ,"m" (ebxsave)
  2017. #endif
  2018. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2019. #if !defined(PIC)
  2020. ,"%"REG_b
  2021. #endif
  2022. );
  2023. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
  2024. dst[i] = src[srcW-1]*128;
  2025. }
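/* The tail loop above guards the right edge: the generated code may sample
 * past the last input pixel, so every output whose source position maps at
 * or beyond srcW-1 is rewritten with the replicated edge pixel, prescaled
 * by 128 (<<7) to match the 15-bit intermediate format. */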
  2026. static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
  2027. int dstWidth, const uint8_t *src1,
  2028. const uint8_t *src2, int srcW, int xInc)
  2029. {
  2030. int16_t *filterPos = c->hChrFilterPos;
  2031. int16_t *filter = c->hChrFilter;
  2032. void *mmx2FilterCode= c->chrMmx2FilterCode;
  2033. int i;
  2034. #if defined(PIC)
  2035. DECLARE_ALIGNED(8, uint64_t, ebxsave);
  2036. #endif
  2037. __asm__ volatile(
  2038. #if defined(PIC)
  2039. "mov %%"REG_b", %7 \n\t"
  2040. #endif
  2041. "pxor %%mm7, %%mm7 \n\t"
  2042. "mov %0, %%"REG_c" \n\t"
  2043. "mov %1, %%"REG_D" \n\t"
  2044. "mov %2, %%"REG_d" \n\t"
  2045. "mov %3, %%"REG_b" \n\t"
  2046. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2047. PREFETCH" (%%"REG_c") \n\t"
  2048. PREFETCH" 32(%%"REG_c") \n\t"
  2049. PREFETCH" 64(%%"REG_c") \n\t"
  2050. CALL_MMX2_FILTER_CODE
  2051. CALL_MMX2_FILTER_CODE
  2052. CALL_MMX2_FILTER_CODE
  2053. CALL_MMX2_FILTER_CODE
  2054. "xor %%"REG_a", %%"REG_a" \n\t" // i
  2055. "mov %5, %%"REG_c" \n\t" // src
  2056. "mov %6, %%"REG_D" \n\t" // buf2
  2057. PREFETCH" (%%"REG_c") \n\t"
  2058. PREFETCH" 32(%%"REG_c") \n\t"
  2059. PREFETCH" 64(%%"REG_c") \n\t"
  2060. CALL_MMX2_FILTER_CODE
  2061. CALL_MMX2_FILTER_CODE
  2062. CALL_MMX2_FILTER_CODE
  2063. CALL_MMX2_FILTER_CODE
  2064. #if defined(PIC)
  2065. "mov %7, %%"REG_b" \n\t"
  2066. #endif
  2067. :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
  2068. "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
  2069. #if defined(PIC)
  2070. ,"m" (ebxsave)
  2071. #endif
  2072. : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
  2073. #if !defined(PIC)
  2074. ,"%"REG_b
  2075. #endif
  2076. );
  2077. for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
  2078. dst1[i] = src1[srcW-1]*128;
  2079. dst2[i] = src2[srcW-1]*128;
  2080. }
  2081. }
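/* hcscale_fast is the chroma twin of hyscale_fast: it runs the generated
 * filter once per plane (note the re-zeroed index register and the switch
 * to src2/dst2 between the two groups of CALL_MMX2_FILTER_CODE) and applies
 * the same edge replication to both planes. */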
  2082. #endif /* COMPILE_TEMPLATE_MMX2 */
  2083. static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
  2084. {
  2085. enum PixelFormat srcFormat = c->srcFormat,
  2086. dstFormat = c->dstFormat;
  2087. if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat)) {
  2088. if (!(c->flags & SWS_BITEXACT)) {
  2089. if (c->flags & SWS_ACCURATE_RND) {
  2090. c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
  2091. c->yuv2yuvX = RENAME(yuv2yuvX_ar );
  2092. if (!(c->flags & SWS_FULL_CHR_H_INT)) {
  2093. switch (c->dstFormat) {
  2094. case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
  2095. case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
  2096. case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
  2097. case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
  2098. case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
  2099. default: break;
  2100. }
  2101. }
  2102. } else {
  2103. c->yuv2yuv1 = RENAME(yuv2yuv1 );
  2104. c->yuv2yuvX = RENAME(yuv2yuvX );
  2105. if (!(c->flags & SWS_FULL_CHR_H_INT)) {
  2106. switch (c->dstFormat) {
  2107. case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
  2108. case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
  2109. case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
  2110. case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
  2111. case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
  2112. default: break;
  2113. }
  2114. }
  2115. }
  2116. }
  2117. if (!(c->flags & SWS_FULL_CHR_H_INT)) {
  2118. switch (c->dstFormat) {
  2119. case PIX_FMT_RGB32:
  2120. c->yuv2packed1 = RENAME(yuv2rgb32_1);
  2121. c->yuv2packed2 = RENAME(yuv2rgb32_2);
  2122. break;
  2123. case PIX_FMT_BGR24:
  2124. c->yuv2packed1 = RENAME(yuv2bgr24_1);
  2125. c->yuv2packed2 = RENAME(yuv2bgr24_2);
  2126. break;
  2127. case PIX_FMT_RGB555:
  2128. c->yuv2packed1 = RENAME(yuv2rgb555_1);
  2129. c->yuv2packed2 = RENAME(yuv2rgb555_2);
  2130. break;
  2131. case PIX_FMT_RGB565:
  2132. c->yuv2packed1 = RENAME(yuv2rgb565_1);
  2133. c->yuv2packed2 = RENAME(yuv2rgb565_2);
  2134. break;
  2135. case PIX_FMT_YUYV422:
  2136. c->yuv2packed1 = RENAME(yuv2yuyv422_1);
  2137. c->yuv2packed2 = RENAME(yuv2yuyv422_2);
  2138. break;
  2139. default:
  2140. break;
  2141. }
  2142. }
  2143. }
  2144. #if !COMPILE_TEMPLATE_MMX2
  2145. c->hScale = RENAME(hScale );
  2146. #endif /* !COMPILE_TEMPLATE_MMX2 */
2147. // Fall back to the plain MMX hScale above when the MMX2 fast-bilinear scaler can't be used (the MMX scaler is faster than the old x86 ASM one).
  2148. #if COMPILE_TEMPLATE_MMX2
  2149. if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
  2150. {
  2151. c->hyscale_fast = RENAME(hyscale_fast);
  2152. c->hcscale_fast = RENAME(hcscale_fast);
  2153. } else {
  2154. #endif /* COMPILE_TEMPLATE_MMX2 */
  2155. c->hyscale_fast = NULL;
  2156. c->hcscale_fast = NULL;
  2157. #if COMPILE_TEMPLATE_MMX2
  2158. }
  2159. #endif /* COMPILE_TEMPLATE_MMX2 */
  2160. #if !COMPILE_TEMPLATE_MMX2
  2161. switch(srcFormat) {
  2162. case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
  2163. case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
  2164. case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
  2165. case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
  2166. case PIX_FMT_YUV420P16BE:
  2167. case PIX_FMT_YUV422P16BE:
  2168. case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
  2169. case PIX_FMT_YUV420P16LE:
  2170. case PIX_FMT_YUV422P16LE:
  2171. case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
  2172. default: break;
  2173. }
  2174. #endif /* !COMPILE_TEMPLATE_MMX2 */
  2175. if (!c->chrSrcHSubSample) {
  2176. switch(srcFormat) {
  2177. case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
  2178. case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
  2179. default: break;
  2180. }
  2181. }
  2182. switch (srcFormat) {
  2183. #if !COMPILE_TEMPLATE_MMX2
  2184. case PIX_FMT_YUYV422 :
  2185. case PIX_FMT_YUV420P16BE:
  2186. case PIX_FMT_YUV422P16BE:
  2187. case PIX_FMT_YUV444P16BE:
  2188. case PIX_FMT_Y400A :
  2189. case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
  2190. case PIX_FMT_UYVY422 :
  2191. case PIX_FMT_YUV420P16LE:
  2192. case PIX_FMT_YUV422P16LE:
  2193. case PIX_FMT_YUV444P16LE:
  2194. case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
  2195. #endif /* !COMPILE_TEMPLATE_MMX2 */
  2196. case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
  2197. case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
  2198. default: break;
  2199. }
  2200. #if !COMPILE_TEMPLATE_MMX2
  2201. if (c->alpPixBuf) {
  2202. switch (srcFormat) {
  2203. case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
  2204. default: break;
  2205. }
  2206. }
  2207. #endif /* !COMPILE_TEMPLATE_MMX2 */
  2208. }