/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PREFETCH

#if COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
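
/*
 * Vertical multi-tap filter: for each output byte, accumulate
 * pmulhw(srcData, filterCoeff) over the filter taps (the tap list in the
 * context is terminated by a NULL source pointer, hence the test/jnz),
 * starting from the packed rounding constant at VROUNDER_OFFSET, then
 * shift down and pack with unsigned saturation.
 *
 * Roughly equivalent scalar code (a sketch mirroring swscale's generic C
 * path; av_clip_uint8() assumed from libavutil, names illustrative):
 *
 *     for (i = 0; i < dstW; i++) {
 *         int val = 1 << 18;
 *         for (j = 0; j < filterSize; j++)
 *             val += src[j][i] * filter[j];
 *         dest[i] = av_clip_uint8(val >> 19);
 *     }
 */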
#define YSCALEYUV2YV12X(offset, dest, end, pos) \
    __asm__ volatile(\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ".p2align 4 \n\t" /* FIXME Unroll? */\
        "1: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
        "movq 8(%%"REG_S", %3, 2), %%mm5 \n\t" /* srcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        " jnz 1b \n\t"\
        "psraw $3, %%mm3 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm3 \n\t"\
        MOVNTQ(%%mm3, (%1, %3))\
        "add $8, %3 \n\t"\
        "cmp %2, %3 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "r" (&c->redDither),\
           "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
        : "%"REG_d, "%"REG_S\
    );
static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
                             const int16_t **lumSrc, int lumFilterSize,
                             const int16_t *chrFilter, const int16_t **chrUSrc,
                             const int16_t **chrVSrc,
                             int chrFilterSize, const int16_t **alpSrc,
                             uint8_t *dest[4], int dstW, int chrDstW)
{
    uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
            *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;

    if (uDest) {
        x86_reg uv_off = c->uv_off;
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
    }
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
    }
    YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, yDest, dstW, 0)
}
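
/*
 * Bit-exact ("accurate rounding") variant of the above: instead of letting
 * pmulhw truncate each product to 16 bits, word pairs from two taps are
 * interleaved and fed to pmaddwd so the accumulation happens in 32 bits
 * (APCK_PTR2/APCK_COEF/APCK_SIZE describe that two-tap packed filter
 * layout); the rounder is applied only after the final psrad/packssdw.
 */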
#define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
    __asm__ volatile(\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ".p2align 4 \n\t"\
        "1: \n\t"\
        "movq (%%"REG_S", %3, 2), %%mm0 \n\t" /* srcData */\
        "movq 8(%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %3, 2), %%mm1 \n\t" /* srcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "movq 8(%%"REG_S", %3, 2), %%mm3 \n\t" /* srcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 1b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "psraw $3, %%mm6 \n\t"\
        "packuswb %%mm6, %%mm4 \n\t"\
        MOVNTQ(%%mm4, (%1, %3))\
        "add $8, %3 \n\t"\
        "cmp %2, %3 \n\t"\
        "lea " offset "(%0), %%"REG_d" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "r" (&c->redDither),\
           "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest[4], int dstW, int chrDstW)
{
    uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
            *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;

    if (uDest) {
        x86_reg uv_off = c->uv_off;
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
    }
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
    }
    YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, yDest, dstW, 0)
}
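
/*
 * Unscaled 1:1 path: the 16-bit intermediate just gets shifted down to
 * 8 bits and packed. The loop runs a negative index up towards zero so
 * the carry flag from "add" can terminate it (jnc).
 *
 * Roughly equivalent scalar code (a sketch, per plane):
 *
 *     for (i = 0; i < dstW; i++)
 *         dst[i] = av_clip_uint8(src[i] >> 7);
 *
 * The _ar variant below additionally adds 64 (built in %%mm7 via
 * pcmpeqw/psrlw/psllw) before the shift, for correct rounding.
 */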
static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
                             const int16_t *chrUSrc, const int16_t *chrVSrc,
                             const int16_t *alpSrc,
                             uint8_t *dst[4], int dstW, int chrDstW)
{
    int p = 4;
    const int16_t *src[4] = {
        lumSrc + dstW, chrUSrc + chrDstW,
        chrVSrc + chrDstW, alpSrc + dstW
    };
    x86_reg counter[4] = { dstW, chrDstW, chrDstW, dstW };

    while (p--) {
        if (dst[p]) {
            __asm__ volatile(
                "mov %2, %%"REG_a" \n\t"
                ".p2align 4 \n\t" /* FIXME Unroll? */
                "1: \n\t"
                "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
                "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
                "psraw $7, %%mm0 \n\t"
                "psraw $7, %%mm1 \n\t"
                "packuswb %%mm1, %%mm0 \n\t"
                MOVNTQ(%%mm0, (%1, %%REGa))
                "add $8, %%"REG_a" \n\t"
                "jnc 1b \n\t"
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }
}
static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
                                const int16_t *chrUSrc, const int16_t *chrVSrc,
                                const int16_t *alpSrc,
                                uint8_t *dst[4], int dstW, int chrDstW)
{
    int p = 4;
    const int16_t *src[4] = {
        lumSrc + dstW, chrUSrc + chrDstW,
        chrVSrc + chrDstW, alpSrc + dstW
    };
    x86_reg counter[4] = { dstW, chrDstW, chrDstW, dstW };

    while (p--) {
        if (dst[p]) {
            __asm__ volatile(
                "mov %2, %%"REG_a" \n\t"
                "pcmpeqw %%mm7, %%mm7 \n\t"
                "psrlw $15, %%mm7 \n\t"
                "psllw $6, %%mm7 \n\t"
                ".p2align 4 \n\t" /* FIXME Unroll? */
                "1: \n\t"
                "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
                "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
                "paddsw %%mm7, %%mm0 \n\t"
                "paddsw %%mm7, %%mm1 \n\t"
                "psraw $7, %%mm0 \n\t"
                "psraw $7, %%mm1 \n\t"
                "packuswb %%mm1, %%mm0 \n\t"
                MOVNTQ(%%mm0, (%1, %%REGa))
                "add $8, %%"REG_a" \n\t"
                "jnc 1b \n\t"
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }
}
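
/*
 * The YSCALEYUV2PACKEDX* macro family below builds packed-pixel output:
 * _UV runs the chroma filter into %%mm3 (U) and %%mm4 (V) -- %6 holds the
 * offset from the U plane to the V plane so the same source pointer can
 * be reused -- and _YA runs the luma (or alpha) filter into the given
 * destination registers. They open the "1:" per-pixel loop that the
 * WRITE* macros later close with their "jb 1b" branch.
 */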
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
        :: "r" (&c->redDither), \
           "m" (dummy), "m" (dummy), "m" (dummy),\
           "r" (dest), "m" (dstW_reg), "m"(uv_off) \
        : "%"REG_a, "%"REG_d, "%"REG_S \
    );
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "movq %%mm4, "U_TEMP"(%0) \n\t"\
        "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
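
/*
 * Fixed-point YUV -> RGB conversion of two groups of four pixels.
 * Conceptually, per pixel (the *_COEFF/*_OFFSET constants are precomputed
 * in the context; the green coefficients are negative, so the additions
 * below effectively subtract the chroma contribution):
 *
 *     Y' = Y_COEFF * (Y - Y_OFFSET)
 *     B  = Y' + UB_COEFF * (U - 128)
 *     G  = Y' + UG_COEFF * (U - 128) + VG_COEFF * (V - 128)
 *     R  = Y' + VR_COEFF * (V - 128)
 *
 * followed by unsigned-saturating packs to bytes.
 */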
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "movq %%mm2, "U_TEMP"(%0) \n\t"
        "movq %%mm4, "V_TEMP"(%0) \n\t"
        "movq %%mm5, "Y_TEMP"(%0) \n\t"
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
        "movq "Y_TEMP"(%0), %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}
static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}
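
/*
 * Pack B/G/R bytes into RGB565: mask off the low bits (bF8 = 11111000,
 * bFC = 11111100), shift the 5/6/5-bit fields into place and OR them
 * together, four pixels per quadword store.
 */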
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
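
/*
 * 24-bit output needs 3 bytes per pixel, so 8 pixels span three quadword
 * stores. The plain MMX version below assembles them with shift/OR
 * juggling; the MMX2 version uses pshufw plus the ff_M24A/B/C byte masks
 * to route the B/G/R bytes directly into their target positions.
 */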
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}
static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}
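
/*
 * Pack to YUYV (Y0 U Y1 V): %%mm3/%%mm4 hold U/V, %%mm1/%%mm7 the luma;
 * punpcklbw builds the UV byte pairs, then the luma is interleaved with
 * them for two quadword stores per iteration.
 */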
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                     const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrUSrc,
                                     const int16_t **chrVSrc,
                                     int chrFilterSize, const int16_t **alpSrc,
                                     uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX_ACCURATE
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
                                  const int16_t **lumSrc, int lumFilterSize,
                                  const int16_t *chrFilter, const int16_t **chrUSrc,
                                  const int16_t **chrVSrc,
                                  int chrFilterSize, const int16_t **alpSrc,
                                  uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy = 0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_off << 1;

    YSCALEYUV2PACKEDX
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}
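
/*
 * Two-row ("bilinear") variants: instead of an arbitrary filter, blend a
 * pair of source rows with a single fractional weight, computed as
 * b + ((a - b) * alpha >> 16) via psubw/pmulhw/paddw. _UV does this for
 * chroma (weight at CHR_MMX_FILTER_OFFSET+8), _YA for luma
 * (LUM_MMX_FILTER_OFFSET+8), and _COEFF finishes the YUV -> RGB math as
 * described above.
 */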
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax+4] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
/**
 * vertical bilinear scale YV12 to RGB
 */
static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
#if ARCH_X86_64
        __asm__ volatile(
            YSCALEYUV2RGB(%%r8, %5)
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
               "a" (&c->redDither),
               "r" (abuf0), "r" (abuf1)
            : "%r8"
        );
#else
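        /*
         * 32-bit path: EBX and EBP may be reserved (PIC pointer, frame
         * pointer), so they are stashed in the context (ESP_OFFSET) and on
         * the stack around the asm block to free enough registers; abuf0
         * and abuf1 travel through c->u_temp/c->v_temp for the same reason.
         */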
        *(const uint16_t **)(&c->u_temp) = abuf0;
        *(const uint16_t **)(&c->v_temp) = abuf1;
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "push %0 \n\t"
            "push %1 \n\t"
            "mov "U_TEMP"(%5), %0 \n\t"
            "mov "V_TEMP"(%5), %1 \n\t"
            YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            "pop %1 \n\t"
            "pop %0 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
#endif
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB15(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB16(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax+4] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf[2], uint8_t *dest,
                                  int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2PACKED(%%REGBP, %5)
        WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
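
/*
 * Single-input-row ("_1") variants: no vertical luma interpolation at all.
 * YSCALEYUV2RGB1 reads only uvbuf0, which is cheapest but shifts chroma by
 * half a sample when uvalpha is near the midpoint; YSCALEYUV2RGB1b instead
 * averages uvbuf0 and uvbuf1. The callers below pick between them with the
 * uvalpha < 2048 test.
 */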
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
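
/*
 * Load 8 alpha samples from abuf0 (%1) and reduce them to bytes in %%mm7
 * for the WRITEBGR32 store; the >>7 matches the unscaled luma path above.
 */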
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)

/**
 * YV12 to RGB without scaling or interpolating
 */
static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *bguf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    } else {
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    }
}
static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *bguf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *bguf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *bguf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
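
/* The DITHER1XBPP blocks above add a per-channel ordered-dither bias with a
 * saturating paddusb before WRITERGB15/WRITERGB16 truncate each channel to
 * 5 or 6 bits, spreading the truncation error across neighbouring pixels.
 * Scalar sketch for RGB565 (illustrative only; the actual dither values live
 * in the context at RED/GREEN/BLUE_DITHER):
 *
 *     r = FFMIN(r + dither_r[x & 7], 255) >> 3;  // 5 bits
 *     g = FFMIN(g + dither_g[x & 7], 255) >> 2;  // 6 bits
 *     b = FFMIN(b + dither_b[x & 7], 255) >> 3;  // 5 bits
 */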
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t"\
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t"\
    "psraw $7, %%mm3 \n\t"\
    "psraw $7, %%mm4 \n\t"\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t"\
    "psraw $7, %%mm7 \n\t"

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFFx2"("#c"), "#index" \n\t"\
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFFx2"("#c"), "#index" \n\t"\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t"\
    "psrlw $8, %%mm4 \n\t"\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t"\
    "psraw $7, %%mm7 \n\t"

#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
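
/* Both macros bring luma down from the 15-bit intermediate format with an
 * arithmetic >>7; the averaged variant uses a logical >>8 on the chroma sum
 * instead, which folds the /2 of the average into the same shift. */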
static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
                                  const int16_t *ubuf[2], const int16_t *bguf[2],
                                  const int16_t *abuf0, uint8_t *dest,
                                  int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    const int16_t *buf1 = buf0; //FIXME needed for RGB1/BGR1
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1b(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
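
/* WRITEYUY2 interleaves the results into packed YUYV. Scalar model of the
 * packing (illustrative sketch only, not the register-level layout):
 *
 *     dest[4 * i + 0] = Y[2 * i];
 *     dest[4 * i + 1] = U[i];
 *     dest[4 * i + 2] = Y[2 * i + 1];
 *     dest[4 * i + 3] = V[i];
 */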
#if !COMPILE_TEMPLATE_MMX2

//FIXME the yuy2* input readers can read up to 7 samples too many
static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src,
                            int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}
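
/* Scalar equivalent of yuy2ToY (reference sketch): the bm01010101 mask keeps
 * the even bytes of the YUYV stream, which are the luma samples:
 *
 *     for (i = 0; i < width; i++)
 *         dst[i] = src[2 * i];
 */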
static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}
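
/* Scalar equivalent of yuy2ToUV (reference sketch): chroma occupies the odd
 * bytes of YUYV, alternating U and V:
 *
 *     for (i = 0; i < width; i++) {
 *         dstU[i] = src1[4 * i + 1];
 *         dstV[i] = src1[4 * i + 3];
 *     }
 */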
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src,
                            int width, uint32_t *unused)
{
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}

static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}

static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                              const uint8_t *src, int width)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "movq %%mm2, (%3, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
}
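
/* Scalar equivalent of nvXXtoUV (reference sketch): NV12/NV21 keep chroma in
 * one interleaved plane, so this is a plain deinterleave:
 *
 *     for (i = 0; i < width; i++) {
 *         dst1[i] = src[2 * i];      // U for NV12, V for NV21
 *         dst2[i] = src[2 * i + 1];
 *     }
 */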
static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}

static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src1, const uint8_t *src2,
                             int width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
#endif /* !COMPILE_TEMPLATE_MMX2 */

static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src,
                                                  int width, enum PixelFormat srcFormat)
{
    if (srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
            :
        );
    }
    __asm__ volatile(
        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
        "mov %2, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "movd 6(%0), %%mm2 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm4, %%mm0 \n\t"
        "paddd %%mm4, %%mm2 \n\t"
        "psrad $15, %%mm0 \n\t"
        "psrad $15, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%1, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dst+width), "g" ((x86_reg)-width)
        : "%"REG_a
    );
}
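
/* The loop above converts four packed 24-bit pixels per iteration: the
 * overlapping movd loads pair the channels up for pmaddwd, so per pixel it
 * computes, roughly,
 *
 *     y = (cB * b + cG * g + cR * r + rounding_offset) >> 15;
 *
 * with the fixed-point weights taken from ff_bgr24toY1Coeff/Y2Coeff (or the
 * rgb24 tables for RGB input) and rounding_offset from ff_bgr24toYOffset. */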
static void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src,
                             int width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
}

static void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src,
                             int width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
}

static av_always_inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV,
                                                   const uint8_t *src, int width,
                                                   enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq 24(%4), %%mm6 \n\t"
        "mov %3, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pmaddwd (%4), %%mm0 \n\t"
        "pmaddwd 8(%4), %%mm1 \n\t"
        "pmaddwd 16(%4), %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "movd 6(%0), %%mm1 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "pmaddwd (%4), %%mm1 \n\t"
        "pmaddwd 8(%4), %%mm3 \n\t"
        "pmaddwd 16(%4), %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm5 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm5, %%mm4 \n\t"
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
        "paddd %%mm3, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm3, %%mm4 \n\t"
        "psrad $15, %%mm0 \n\t"
        "psrad $15, %%mm2 \n\t"
        "psrad $15, %%mm1 \n\t"
        "psrad $15, %%mm4 \n\t"
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm4, %%mm2 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm2, %%mm2 \n\t"
        "movd %%mm0, (%1, %%"REG_a") \n\t"
        "movd %%mm2, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r" (ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
        : "%"REG_a
    );
}
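
/* Chroma follows the same scheme, with the weights fetched at runtime from
 * the ff_bgr24toUV table entry for the source format; roughly, per pixel,
 *
 *     u = (cBu * b + cGu * g + cRu * r + uv_offset) >> 15;
 *     v = (cBv * b + cGv * g + cRv * r + uv_offset) >> 15;
 *
 * where uv_offset is ff_bgr24toUVOffset. */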
static void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV,
                              const uint8_t *src1, const uint8_t *src2,
                              int width, uint32_t *unused)
{
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
    assert(src1 == src2);
}

static void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV,
                              const uint8_t *src1, const uint8_t *src2,
                              int width, uint32_t *unused)
{
    assert(src1 == src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
}

#if !COMPILE_TEMPLATE_MMX2
// bilinear / bicubic scaling
static void RENAME(hScale)(SwsContext *c, int16_t *dst, int dstW,
                           const uint8_t *src, const int16_t *filter,
                           const int16_t *filterPos, int filterSize)
{
    assert(filterSize % 4 == 0 && filterSize > 0);
    if (filterSize == 4) { // Always true for upscaling, sometimes for down, too.
        x86_reg counter = -2*dstW;
        filter -= counter*2;
        filterPos -= counter/2;
        dst -= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize == 8) {
        x86_reg counter = -2*dstW;
        filter -= counter*4;
        filterPos -= counter/2;
        dst -= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
            "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm4 \n\t"
            "pmaddwd %%mm2, %%mm5 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "paddd %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"
            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        const uint8_t *offset = src+filterSize;
        x86_reg counter = -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos -= counter/2;
        dst -= counter/2;
        __asm__ volatile(
            "pxor %%mm7, %%mm7 \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "mov %2, %%"REG_c" \n\t"
            "movzwl (%%"REG_c", %0), %%eax \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c" \n\t"
            "pxor %%mm4, %%mm4 \n\t"
            "pxor %%mm5, %%mm5 \n\t"
            "2: \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1, %6), %%mm3 \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "paddd %%mm3, %%mm5 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "add $8, %1 \n\t"
            "add $4, %%"REG_c" \n\t"
            "cmp %4, %%"REG_c" \n\t"
            " jb 2b \n\t"
            "add %6, %1 \n\t"
            "movq %%mm4, %%mm0 \n\t"
            "punpckldq %%mm5, %%mm4 \n\t"
            "punpckhdq %%mm5, %%mm0 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "psrad $7, %%mm4 \n\t"
            "packssdw %%mm4, %%mm4 \n\t"
            "mov %3, %%"REG_a" \n\t"
            "movd %%mm4, (%%"REG_a", %0) \n\t"
            "add $4, %0 \n\t"
            " jnc 1b \n\t"
            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m" (offset),
              "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
}
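
/* What hScale computes, in scalar form (reference sketch; the MMX code above
 * saturates through packssdw rather than an explicit clip):
 *
 *     for (i = 0; i < dstW; i++) {
 *         int j, val = 0;
 *         for (j = 0; j < filterSize; j++)
 *             val += src[filterPos[i] + j] * filter[filterSize * i + j];
 *         dst[i] = FFMIN(val >> 7, (1 << 15) - 1);  // 8 bit -> 15 bit
 *     }
 */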
#endif /* !COMPILE_TEMPLATE_MMX2 */

#if COMPILE_TEMPLATE_MMX2
static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                 int dstWidth, const uint8_t *src,
                                 int srcW, int xInc)
{
    int16_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    void *mmx2FilterCode = c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %5 \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
#if ARCH_X86_64
#define CALL_MMX2_FILTER_CODE \
        "movl (%%"REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
        "add %%"REG_S", %%"REG_c" \n\t"\
        "add %%"REG_a", %%"REG_D" \n\t"\
        "xor %%"REG_a", %%"REG_a" \n\t"
#else
#define CALL_MMX2_FILTER_CODE \
        "movl (%%"REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
        "add %%"REG_a", %%"REG_D" \n\t"\
        "xor %%"REG_a", %%"REG_a" \n\t"
#endif /* ARCH_X86_64 */
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
#if defined(PIC)
        "mov %5, %%"REG_b" \n\t"
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode)
#if defined(PIC)
        ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
        ,"%"REG_b
#endif
    );
    for (i = dstWidth - 1; (i*xInc)>>16 >= srcW - 1; i--)
        dst[i] = src[srcW-1]*128;
}
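
/* hyscale_fast calls into scaler code that was generated at runtime into
 * c->lumMmx2FilterCode; the trailing C loop then replicates the last source
 * pixel (scaled by 128, i.e. <<7, to match the 15-bit intermediate format)
 * for output positions that would read past srcW-1. */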
static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
                                 int dstWidth, const uint8_t *src1,
                                 const uint8_t *src2, int srcW, int xInc)
{
    int16_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    void *mmx2FilterCode = c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    __asm__ volatile(
#if defined(PIC)
        "mov %%"REG_b", %7 \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "mov %5, %%"REG_c" \n\t" // src
        "mov %6, %%"REG_D" \n\t" // buf2
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
#if defined(PIC)
        "mov %7, %%"REG_b" \n\t"
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode), "m" (src2), "m" (dst2)
#if defined(PIC)
        ,"m" (ebxsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
        ,"%"REG_b
#endif
    );
    for (i = dstWidth - 1; (i*xInc)>>16 >= srcW - 1; i--) {
        dst1[i] = src1[srcW-1]*128;
        dst2[i] = src2[srcW-1]*128;
    }
}
#endif /* COMPILE_TEMPLATE_MMX2 */

static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat,
                     dstFormat = c->dstFormat;
    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
        dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) {
        if (!(c->flags & SWS_BITEXACT)) {
            if (c->flags & SWS_ACCURATE_RND) {
                c->yuv2yuv1 = RENAME(yuv2yuv1_ar);
                c->yuv2yuvX = RENAME(yuv2yuvX_ar);
                if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                    switch (c->dstFormat) {
                    case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
                    case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
                    case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
                    case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
                    case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
                    default: break;
                    }
                }
            } else {
                c->yuv2yuv1 = RENAME(yuv2yuv1);
                c->yuv2yuvX = RENAME(yuv2yuvX);
                if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                    switch (c->dstFormat) {
                    case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
                    case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
                    case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
                    case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
                    case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
                    default: break;
                    }
                }
            }
        }
        if (!(c->flags & SWS_FULL_CHR_H_INT)) {
            switch (c->dstFormat) {
            case PIX_FMT_RGB32:
                c->yuv2packed1 = RENAME(yuv2rgb32_1);
                c->yuv2packed2 = RENAME(yuv2rgb32_2);
                break;
            case PIX_FMT_BGR24:
                c->yuv2packed1 = RENAME(yuv2bgr24_1);
                c->yuv2packed2 = RENAME(yuv2bgr24_2);
                break;
            case PIX_FMT_RGB555:
                c->yuv2packed1 = RENAME(yuv2rgb555_1);
                c->yuv2packed2 = RENAME(yuv2rgb555_2);
                break;
            case PIX_FMT_RGB565:
                c->yuv2packed1 = RENAME(yuv2rgb565_1);
                c->yuv2packed2 = RENAME(yuv2rgb565_2);
                break;
            case PIX_FMT_YUYV422:
                c->yuv2packed1 = RENAME(yuv2yuyv422_1);
                c->yuv2packed2 = RENAME(yuv2yuyv422_2);
                break;
            default:
                break;
            }
        }
    }
    if (c->scalingBpp == 8) {
#if !COMPILE_TEMPLATE_MMX2
        c->hScale = RENAME(hScale);
#endif /* !COMPILE_TEMPLATE_MMX2 */
        // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMX2
        if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed) {
            c->hyscale_fast = RENAME(hyscale_fast);
            c->hcscale_fast = RENAME(hcscale_fast);
        } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
            c->hyscale_fast = NULL;
            c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMX2
        }
#endif /* COMPILE_TEMPLATE_MMX2 */
    }
#if !COMPILE_TEMPLATE_MMX2
    switch (srcFormat) {
    case PIX_FMT_YUYV422: c->chrToYV12 = RENAME(yuy2ToUV); break;
    case PIX_FMT_UYVY422: c->chrToYV12 = RENAME(uyvyToUV); break;
    case PIX_FMT_NV12:    c->chrToYV12 = RENAME(nv12ToUV); break;
    case PIX_FMT_NV21:    c->chrToYV12 = RENAME(nv21ToUV); break;
    default: break;
    }
#endif /* !COMPILE_TEMPLATE_MMX2 */
    if (!c->chrSrcHSubSample) {
        switch (srcFormat) {
        case PIX_FMT_BGR24: c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_RGB24: c->chrToYV12 = RENAME(rgb24ToUV); break;
        default: break;
        }
    }
    switch (srcFormat) {
#if !COMPILE_TEMPLATE_MMX2
    case PIX_FMT_YUYV422:
    case PIX_FMT_Y400A:   c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422: c->lumToYV12 = RENAME(uyvyToY); break;
#endif /* !COMPILE_TEMPLATE_MMX2 */
    case PIX_FMT_BGR24:   c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_RGB24:   c->lumToYV12 = RENAME(rgb24ToY); break;
    default: break;
    }
#if !COMPILE_TEMPLATE_MMX2
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_Y400A: c->alpToYV12 = RENAME(yuy2ToY); break;
        default: break;
        }
    }
#endif /* !COMPILE_TEMPLATE_MMX2 */
}