  1. // Software scaling and colorspace conversion routines for MPlayer
  2. // Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
  3. // current version mostly by Michael Niedermayer (michaelni@gmx.at)
  4. // the parts written by Michael are under GNU GPL
  5. #undef MOVNTQ
  6. #undef PAVGB
  7. #undef PREFETCH
  8. #undef PREFETCHW
  9. #undef EMMS
  10. #undef SFENCE
  11. #ifdef HAVE_3DNOW
  12. /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
  13. #define EMMS "femms"
  14. #else
  15. #define EMMS "emms"
  16. #endif
  17. #ifdef HAVE_3DNOW
  18. #define PREFETCH "prefetch"
  19. #define PREFETCHW "prefetchw"
  20. #elif defined ( HAVE_MMX2 )
  21. #define PREFETCH "prefetchnta"
  22. #define PREFETCHW "prefetcht0"
  23. #else
  24. #define PREFETCH "/nop"
  25. #define PREFETCHW "/nop"
  26. #endif
  27. #ifdef HAVE_MMX2
  28. #define SFENCE "sfence"
  29. #else
  30. #define SFENCE "/nop"
  31. #endif
  32. #ifdef HAVE_MMX2
  33. #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
  34. #elif defined (HAVE_3DNOW)
  35. #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
  36. #endif
  37. #ifdef HAVE_MMX2
  38. #define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
  39. #else
  40. #define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
  41. #endif
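/* These wrappers pick the best available instruction at build time. As an
   illustration (assuming HAVE_MMX2 is defined),

       MOVNTQ(%%mm0, (%1, %%eax))

   expands to a "movntq" non-temporal store; the output buffers are written
   once and not read back soon, so bypassing the cache helps, while plain
   MMX falls back to an ordinary movq. PAVGB and PREFETCH are dispatched
   the same way for MMX2 and 3DNow. */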
  42. #define YSCALEYUV2YV12X(x) \
  43. "xorl %%eax, %%eax \n\t"\
  44. "pxor %%mm3, %%mm3 \n\t"\
  45. "pxor %%mm4, %%mm4 \n\t"\
  46. "movl %0, %%edx \n\t"\
  47. ".balign 16 \n\t" /* FIXME Unroll? */\
  48. "1: \n\t"\
  49. "movl (%1, %%edx, 4), %%esi \n\t"\
  50. "movq (%2, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
  51. "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\
  52. "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
  53. "pmulhw %%mm0, %%mm2 \n\t"\
  54. "pmulhw %%mm0, %%mm5 \n\t"\
  55. "paddw %%mm2, %%mm3 \n\t"\
  56. "paddw %%mm5, %%mm4 \n\t"\
  57. "addl $1, %%edx \n\t"\
  58. " jnz 1b \n\t"\
  59. "psraw $3, %%mm3 \n\t"\
  60. "psraw $3, %%mm4 \n\t"\
  61. "packuswb %%mm4, %%mm3 \n\t"\
  62. MOVNTQ(%%mm3, (%3, %%eax))\
  63. "addl $8, %%eax \n\t"\
  64. "cmpl %4, %%eax \n\t"\
  65. "pxor %%mm3, %%mm3 \n\t"\
  66. "pxor %%mm4, %%mm4 \n\t"\
  67. "movl %0, %%edx \n\t"\
  68. "jb 1b \n\t"
  69. #define YSCALEYUV2YV121 \
  70. "movl %2, %%eax \n\t"\
  71. ".balign 16 \n\t" /* FIXME Unroll? */\
  72. "1: \n\t"\
  73. "movq (%0, %%eax, 2), %%mm0 \n\t"\
  74. "movq 8(%0, %%eax, 2), %%mm1 \n\t"\
  75. "psraw $7, %%mm0 \n\t"\
  76. "psraw $7, %%mm1 \n\t"\
  77. "packuswb %%mm1, %%mm0 \n\t"\
  78. MOVNTQ(%%mm0, (%1, %%eax))\
  79. "addl $8, %%eax \n\t"\
  80. "jnc 1b \n\t"
  81. /*
  82. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  83. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  84. "r" (dest), "m" (dstW),
  85. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  86. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  87. */
  88. #define YSCALEYUV2RGBX \
  89. "xorl %%eax, %%eax \n\t"\
  90. ".balign 16 \n\t"\
  91. "1: \n\t"\
  92. "movl %1, %%edx \n\t" /* -chrFilterSize */\
  93. "movl %3, %%ebx \n\t" /* chrMmxFilter+lumFilterSize */\
  94. "movl %7, %%ecx \n\t" /* chrSrc+lumFilterSize */\
  95. "pxor %%mm3, %%mm3 \n\t"\
  96. "pxor %%mm4, %%mm4 \n\t"\
  97. "2: \n\t"\
  98. "movl (%%ecx, %%edx, 4), %%esi \n\t"\
  99. "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
  100. "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\
  101. "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
  102. "pmulhw %%mm0, %%mm2 \n\t"\
  103. "pmulhw %%mm0, %%mm5 \n\t"\
  104. "paddw %%mm2, %%mm3 \n\t"\
  105. "paddw %%mm5, %%mm4 \n\t"\
  106. "addl $1, %%edx \n\t"\
  107. " jnz 2b \n\t"\
  108. \
  109. "movl %0, %%edx \n\t" /* -lumFilterSize */\
  110. "movl %2, %%ebx \n\t" /* lumMmxFilter+lumFilterSize */\
  111. "movl %6, %%ecx \n\t" /* lumSrc+lumFilterSize */\
  112. "pxor %%mm1, %%mm1 \n\t"\
  113. "pxor %%mm7, %%mm7 \n\t"\
  114. "2: \n\t"\
  115. "movl (%%ecx, %%edx, 4), %%esi \n\t"\
  116. "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
  117. "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\
  118. "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
  119. "pmulhw %%mm0, %%mm2 \n\t"\
  120. "pmulhw %%mm0, %%mm5 \n\t"\
  121. "paddw %%mm2, %%mm1 \n\t"\
  122. "paddw %%mm5, %%mm7 \n\t"\
  123. "addl $1, %%edx \n\t"\
  124. " jnz 2b \n\t"\
  125. \
  126. "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
  127. "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
  128. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  129. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  130. "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
  131. "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
  132. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  133. "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
  134. "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
  135. "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
  136. "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
  137. "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
  138. "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
  139. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  140. "paddw %%mm3, %%mm4 \n\t"\
  141. "movq %%mm2, %%mm0 \n\t"\
  142. "movq %%mm5, %%mm6 \n\t"\
  143. "movq %%mm4, %%mm3 \n\t"\
  144. "punpcklwd %%mm2, %%mm2 \n\t"\
  145. "punpcklwd %%mm5, %%mm5 \n\t"\
  146. "punpcklwd %%mm4, %%mm4 \n\t"\
  147. "paddw %%mm1, %%mm2 \n\t"\
  148. "paddw %%mm1, %%mm5 \n\t"\
  149. "paddw %%mm1, %%mm4 \n\t"\
  150. "punpckhwd %%mm0, %%mm0 \n\t"\
  151. "punpckhwd %%mm6, %%mm6 \n\t"\
  152. "punpckhwd %%mm3, %%mm3 \n\t"\
  153. "paddw %%mm7, %%mm0 \n\t"\
  154. "paddw %%mm7, %%mm6 \n\t"\
  155. "paddw %%mm7, %%mm3 \n\t"\
  156. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  157. "packuswb %%mm0, %%mm2 \n\t"\
  158. "packuswb %%mm6, %%mm5 \n\t"\
  159. "packuswb %%mm3, %%mm4 \n\t"\
  160. "pxor %%mm7, %%mm7 \n\t"
  161. #define FULL_YSCALEYUV2RGB \
  162. "pxor %%mm7, %%mm7 \n\t"\
  163. "movd %6, %%mm6 \n\t" /*yalpha1*/\
  164. "punpcklwd %%mm6, %%mm6 \n\t"\
  165. "punpcklwd %%mm6, %%mm6 \n\t"\
  166. "movd %7, %%mm5 \n\t" /*uvalpha1*/\
  167. "punpcklwd %%mm5, %%mm5 \n\t"\
  168. "punpcklwd %%mm5, %%mm5 \n\t"\
  169. "xorl %%eax, %%eax \n\t"\
  170. ".balign 16 \n\t"\
  171. "1: \n\t"\
  172. "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
  173. "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
  174. "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
  175. "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
  176. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  177. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  178. "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  179. "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  180. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  181. "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  182. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  183. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  184. "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
  185. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  186. "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  187. "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
  188. "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
  189. "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
  190. \
  191. \
  192. "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  193. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  194. "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
  195. "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  196. "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
  197. "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  198. "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
  199. \
  200. \
  201. "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
  202. "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
  203. "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
  204. "paddw %%mm1, %%mm3 \n\t" /* B*/\
  205. "paddw %%mm1, %%mm0 \n\t" /* R*/\
  206. "packuswb %%mm3, %%mm3 \n\t"\
  207. \
  208. "packuswb %%mm0, %%mm0 \n\t"\
  209. "paddw %%mm4, %%mm2 \n\t"\
  210. "paddw %%mm2, %%mm1 \n\t" /* G*/\
  211. \
  212. "packuswb %%mm1, %%mm1 \n\t"
  213. #define YSCALEYUV2RGB \
  214. "movd %6, %%mm6 \n\t" /*yalpha1*/\
  215. "punpcklwd %%mm6, %%mm6 \n\t"\
  216. "punpcklwd %%mm6, %%mm6 \n\t"\
  217. "movq %%mm6, "MANGLE(asm_yalpha1)"\n\t"\
  218. "movd %7, %%mm5 \n\t" /*uvalpha1*/\
  219. "punpcklwd %%mm5, %%mm5 \n\t"\
  220. "punpcklwd %%mm5, %%mm5 \n\t"\
  221. "movq %%mm5, "MANGLE(asm_uvalpha1)"\n\t"\
  222. "xorl %%eax, %%eax \n\t"\
  223. ".balign 16 \n\t"\
  224. "1: \n\t"\
  225. "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
  226. "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
  227. "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  228. "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  229. "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
  230. "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
  231. "movq "MANGLE(asm_uvalpha1)", %%mm0\n\t"\
  232. "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
  233. "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
  234. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  235. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  236. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
  237. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
  238. "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
  239. "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
  240. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  241. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  242. "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
  243. "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
  244. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  245. "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
  246. "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
  247. "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
  248. "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
  249. "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
  250. "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
  251. "pmulhw "MANGLE(asm_yalpha1)", %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  252. "pmulhw "MANGLE(asm_yalpha1)", %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
  253. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  254. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  255. "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  256. "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
  257. "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
  258. "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
  259. "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
  260. "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
  261. "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
  262. "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
  263. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  264. "paddw %%mm3, %%mm4 \n\t"\
  265. "movq %%mm2, %%mm0 \n\t"\
  266. "movq %%mm5, %%mm6 \n\t"\
  267. "movq %%mm4, %%mm3 \n\t"\
  268. "punpcklwd %%mm2, %%mm2 \n\t"\
  269. "punpcklwd %%mm5, %%mm5 \n\t"\
  270. "punpcklwd %%mm4, %%mm4 \n\t"\
  271. "paddw %%mm1, %%mm2 \n\t"\
  272. "paddw %%mm1, %%mm5 \n\t"\
  273. "paddw %%mm1, %%mm4 \n\t"\
  274. "punpckhwd %%mm0, %%mm0 \n\t"\
  275. "punpckhwd %%mm6, %%mm6 \n\t"\
  276. "punpckhwd %%mm3, %%mm3 \n\t"\
  277. "paddw %%mm7, %%mm0 \n\t"\
  278. "paddw %%mm7, %%mm6 \n\t"\
  279. "paddw %%mm7, %%mm3 \n\t"\
  280. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  281. "packuswb %%mm0, %%mm2 \n\t"\
  282. "packuswb %%mm6, %%mm5 \n\t"\
  283. "packuswb %%mm3, %%mm4 \n\t"\
  284. "pxor %%mm7, %%mm7 \n\t"
  285. #define YSCALEYUV2RGB1 \
  286. "xorl %%eax, %%eax \n\t"\
  287. ".balign 16 \n\t"\
  288. "1: \n\t"\
  289. "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
  290. "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
  291. "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
  292. "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
  293. "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
  294. "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
  295. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  296. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  297. "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
  298. "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
  299. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  300. "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
  301. "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
  302. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  303. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  304. "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
  305. "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
  306. "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
  307. "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
  308. "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
  309. "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
  310. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  311. "paddw %%mm3, %%mm4 \n\t"\
  312. "movq %%mm2, %%mm0 \n\t"\
  313. "movq %%mm5, %%mm6 \n\t"\
  314. "movq %%mm4, %%mm3 \n\t"\
  315. "punpcklwd %%mm2, %%mm2 \n\t"\
  316. "punpcklwd %%mm5, %%mm5 \n\t"\
  317. "punpcklwd %%mm4, %%mm4 \n\t"\
  318. "paddw %%mm1, %%mm2 \n\t"\
  319. "paddw %%mm1, %%mm5 \n\t"\
  320. "paddw %%mm1, %%mm4 \n\t"\
  321. "punpckhwd %%mm0, %%mm0 \n\t"\
  322. "punpckhwd %%mm6, %%mm6 \n\t"\
  323. "punpckhwd %%mm3, %%mm3 \n\t"\
  324. "paddw %%mm7, %%mm0 \n\t"\
  325. "paddw %%mm7, %%mm6 \n\t"\
  326. "paddw %%mm7, %%mm3 \n\t"\
  327. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  328. "packuswb %%mm0, %%mm2 \n\t"\
  329. "packuswb %%mm6, %%mm5 \n\t"\
  330. "packuswb %%mm3, %%mm4 \n\t"\
  331. "pxor %%mm7, %%mm7 \n\t"
  332. // do vertical chrominance interpolation
  333. #define YSCALEYUV2RGB1b \
  334. "xorl %%eax, %%eax \n\t"\
  335. ".balign 16 \n\t"\
  336. "1: \n\t"\
  337. "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
  338. "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
  339. "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
  340. "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
  341. "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
  342. "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
  343. "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
  344. "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
  345. "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
  346. "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
  347. "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
  348. "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
  349. "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
  350. "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
  351. /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
  352. "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
  353. "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
  354. "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  355. "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
  356. "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
  357. "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
  358. "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
  359. "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
  360. "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
  361. "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
  362. /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
  363. "paddw %%mm3, %%mm4 \n\t"\
  364. "movq %%mm2, %%mm0 \n\t"\
  365. "movq %%mm5, %%mm6 \n\t"\
  366. "movq %%mm4, %%mm3 \n\t"\
  367. "punpcklwd %%mm2, %%mm2 \n\t"\
  368. "punpcklwd %%mm5, %%mm5 \n\t"\
  369. "punpcklwd %%mm4, %%mm4 \n\t"\
  370. "paddw %%mm1, %%mm2 \n\t"\
  371. "paddw %%mm1, %%mm5 \n\t"\
  372. "paddw %%mm1, %%mm4 \n\t"\
  373. "punpckhwd %%mm0, %%mm0 \n\t"\
  374. "punpckhwd %%mm6, %%mm6 \n\t"\
  375. "punpckhwd %%mm3, %%mm3 \n\t"\
  376. "paddw %%mm7, %%mm0 \n\t"\
  377. "paddw %%mm7, %%mm6 \n\t"\
  378. "paddw %%mm7, %%mm3 \n\t"\
  379. /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
  380. "packuswb %%mm0, %%mm2 \n\t"\
  381. "packuswb %%mm6, %%mm5 \n\t"\
  382. "packuswb %%mm3, %%mm4 \n\t"\
  383. "pxor %%mm7, %%mm7 \n\t"
  384. #define WRITEBGR32 \
  385. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  386. "movq %%mm2, %%mm1 \n\t" /* B */\
  387. "movq %%mm5, %%mm6 \n\t" /* R */\
  388. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  389. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  390. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  391. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  392. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  393. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  394. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  395. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  396. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  397. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  398. \
  399. MOVNTQ(%%mm0, (%4, %%eax, 4))\
  400. MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
  401. MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
  402. MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
  403. \
  404. "addl $8, %%eax \n\t"\
  405. "cmpl %5, %%eax \n\t"\
  406. " jb 1b \n\t"
  407. #define WRITEBGR16 \
  408. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  409. "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
  410. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  411. "psrlq $3, %%mm2 \n\t"\
  412. \
  413. "movq %%mm2, %%mm1 \n\t"\
  414. "movq %%mm4, %%mm3 \n\t"\
  415. \
  416. "punpcklbw %%mm7, %%mm3 \n\t"\
  417. "punpcklbw %%mm5, %%mm2 \n\t"\
  418. "punpckhbw %%mm7, %%mm4 \n\t"\
  419. "punpckhbw %%mm5, %%mm1 \n\t"\
  420. \
  421. "psllq $3, %%mm3 \n\t"\
  422. "psllq $3, %%mm4 \n\t"\
  423. \
  424. "por %%mm3, %%mm2 \n\t"\
  425. "por %%mm4, %%mm1 \n\t"\
  426. \
  427. MOVNTQ(%%mm2, (%4, %%eax, 2))\
  428. MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
  429. \
  430. "addl $8, %%eax \n\t"\
  431. "cmpl %5, %%eax \n\t"\
  432. " jb 1b \n\t"
  433. #define WRITEBGR15 \
  434. "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
  435. "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
  436. "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
  437. "psrlq $3, %%mm2 \n\t"\
  438. "psrlq $1, %%mm5 \n\t"\
  439. \
  440. "movq %%mm2, %%mm1 \n\t"\
  441. "movq %%mm4, %%mm3 \n\t"\
  442. \
  443. "punpcklbw %%mm7, %%mm3 \n\t"\
  444. "punpcklbw %%mm5, %%mm2 \n\t"\
  445. "punpckhbw %%mm7, %%mm4 \n\t"\
  446. "punpckhbw %%mm5, %%mm1 \n\t"\
  447. \
  448. "psllq $2, %%mm3 \n\t"\
  449. "psllq $2, %%mm4 \n\t"\
  450. \
  451. "por %%mm3, %%mm2 \n\t"\
  452. "por %%mm4, %%mm1 \n\t"\
  453. \
  454. MOVNTQ(%%mm2, (%4, %%eax, 2))\
  455. MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
  456. \
  457. "addl $8, %%eax \n\t"\
  458. "cmpl %5, %%eax \n\t"\
  459. " jb 1b \n\t"
  460. #define WRITEBGR24OLD \
  461. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  462. "movq %%mm2, %%mm1 \n\t" /* B */\
  463. "movq %%mm5, %%mm6 \n\t" /* R */\
  464. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  465. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  466. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  467. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  468. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  469. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  470. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  471. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  472. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  473. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  474. \
  475. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  476. "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
  477. "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
  478. "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
  479. "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
  480. "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
  481. "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
  482. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  483. \
  484. "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  485. "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
  486. "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
  487. "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
  488. "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
  489. "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
  490. "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
  491. "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
  492. "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
  493. "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
  494. "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
  495. "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
  496. "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
  497. \
  498. "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
  499. "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
  500. "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
  501. "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
  502. "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
  503. "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
  504. "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
  505. "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
  506. \
  507. MOVNTQ(%%mm0, (%%ebx))\
  508. MOVNTQ(%%mm2, 8(%%ebx))\
  509. MOVNTQ(%%mm3, 16(%%ebx))\
  510. "addl $24, %%ebx \n\t"\
  511. \
  512. "addl $8, %%eax \n\t"\
  513. "cmpl %5, %%eax \n\t"\
  514. " jb 1b \n\t"
  515. #define WRITEBGR24MMX \
  516. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  517. "movq %%mm2, %%mm1 \n\t" /* B */\
  518. "movq %%mm5, %%mm6 \n\t" /* R */\
  519. "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
  520. "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
  521. "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
  522. "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
  523. "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
  524. "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
  525. "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
  526. "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
  527. "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
  528. "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
  529. \
  530. "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
  531. "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
  532. "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
  533. "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
  534. \
  535. "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
  536. "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
  537. "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
  538. "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
  539. \
  540. "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
  541. "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
  542. "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
  543. "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
  544. \
  545. "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
  546. "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
  547. "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
  548. "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
  549. MOVNTQ(%%mm0, (%%ebx))\
  550. \
  551. "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
  552. "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
  553. "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
  554. "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
  555. MOVNTQ(%%mm6, 8(%%ebx))\
  556. \
  557. "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
  558. "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
  559. "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
  560. MOVNTQ(%%mm5, 16(%%ebx))\
  561. \
  562. "addl $24, %%ebx \n\t"\
  563. \
  564. "addl $8, %%eax \n\t"\
  565. "cmpl %5, %%eax \n\t"\
  566. " jb 1b \n\t"
  567. #define WRITEBGR24MMX2 \
  568. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
  569. "movq "MANGLE(M24A)", %%mm0 \n\t"\
  570. "movq "MANGLE(M24C)", %%mm7 \n\t"\
  571. "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
  572. "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
  573. "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
  574. \
  575. "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
  576. "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
  577. "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
  578. \
  579. "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
  580. "por %%mm1, %%mm6 \n\t"\
  581. "por %%mm3, %%mm6 \n\t"\
  582. MOVNTQ(%%mm6, (%%ebx))\
  583. \
  584. "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
  585. "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
  586. "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
  587. "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
  588. \
  589. "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
  590. "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
  591. "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
  592. \
  593. "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
  594. "por %%mm3, %%mm6 \n\t"\
  595. MOVNTQ(%%mm6, 8(%%ebx))\
  596. \
  597. "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
  598. "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
  599. "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
  600. \
  601. "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
  602. "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
  603. "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
  604. \
  605. "por %%mm1, %%mm3 \n\t"\
  606. "por %%mm3, %%mm6 \n\t"\
  607. MOVNTQ(%%mm6, 16(%%ebx))\
  608. \
  609. "addl $24, %%ebx \n\t"\
  610. \
  611. "addl $8, %%eax \n\t"\
  612. "cmpl %5, %%eax \n\t"\
  613. " jb 1b \n\t"
  614. #ifdef HAVE_MMX2
  615. #undef WRITEBGR24
  616. #define WRITEBGR24 WRITEBGR24MMX2
  617. #else
  618. #undef WRITEBGR24
  619. #define WRITEBGR24 WRITEBGR24MMX
  620. #endif
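/* All three BGR24 writers emit the same layout, three bytes per pixel with
   no padding byte; scalar equivalent of one pixel:

       dest[3*i+0]= B;  dest[3*i+1]= G;  dest[3*i+2]= R;

   They differ only in how the four 0RGB dwords are shuffled into three
   output quadwords; WRITEBGR24MMX2 can use pshufw, which plain MMX lacks. */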
  621. static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
  622. int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
  623. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW,
  624. int16_t * lumMmxFilter, int16_t * chrMmxFilter)
  625. {
  626. #ifdef HAVE_MMX
  627. if(uDest != NULL)
  628. {
  629. asm volatile(
  630. YSCALEYUV2YV12X(0)
  631. :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
  632. "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1)
  633. : "%eax", "%edx", "%esi"
  634. );
  635. asm volatile(
  636. YSCALEYUV2YV12X(4096)
  637. :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
  638. "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1)
  639. : "%eax", "%edx", "%esi"
  640. );
  641. }
  642. asm volatile(
  643. YSCALEYUV2YV12X(0)
  644. :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
  645. "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
  646. : "%eax", "%edx", "%esi"
  647. );
  648. #else
  649. yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
  650. chrFilter, chrSrc, chrFilterSize,
  651. dest, uDest, vDest, dstW);
  652. #endif
  653. }
  654. static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
  655. uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
  656. {
  657. #ifdef HAVE_MMX
  658. if(uDest != NULL)
  659. {
  660. asm volatile(
  661. YSCALEYUV2YV121
  662. :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)),
  663. "g" (-(dstW>>1))
  664. : "%eax"
  665. );
  666. asm volatile(
  667. YSCALEYUV2YV121
  668. :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)),
  669. "g" (-(dstW>>1))
  670. : "%eax"
  671. );
  672. }
  673. asm volatile(
  674. YSCALEYUV2YV121
  675. :: "r" (lumSrc + dstW), "r" (dest + dstW),
  676. "g" (-dstW)
  677. : "%eax"
  678. );
  679. #else
  680. //FIXME Optimize (just quickly written, not optimized)
  681. //FIXME replace MINMAX with LUTs
  682. int i;
  683. for(i=0; i<dstW; i++)
  684. {
  685. int val= lumSrc[i]>>7;
  686. dest[i]= MIN(MAX(val, 0), 255);
  687. }
  688. if(uDest != NULL)
  689. for(i=0; i<(dstW>>1); i++)
  690. {
  691. int u=chrSrc[i]>>7;
  692. int v=chrSrc[i + 2048]>>7;
  693. uDest[i]= MIN(MAX(u, 0), 255);
  694. vDest[i]= MIN(MAX(v, 0), 255);
  695. }
  696. #endif
  697. }
  698. /**
  699. * vertical scale YV12 to RGB
  700. */
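/* MMX path below: YSCALEYUV2RGBX runs two inner loops that accumulate the
   vertical filter taps for chroma and then luma, converts the sums to R, G
   and B, and one of the WRITEBGR* macros packs and stores the pixels;
   without MMX the generic yuv2rgbXinC() does the same job in C. */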
  701. static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
  702. int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
  703. uint8_t *dest, int dstW, int dstFormat, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
  704. {
  705. /* if(flags&SWS_FULL_UV_IPOL)
  706. {
  707. //FIXME
  708. }//FULL_UV_IPOL
  709. else*/
  710. {
  711. #ifdef HAVE_MMX
  712. if(dstFormat == IMGFMT_BGR32) //FIXME untested
  713. {
  714. asm volatile(
  715. YSCALEYUV2RGBX
  716. WRITEBGR32
  717. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  718. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  719. "r" (dest), "m" (dstW),
  720. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  721. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  722. );
  723. }
  724. else if(dstFormat == IMGFMT_BGR24) //FIXME untested
  725. {
  726. asm volatile(
  727. YSCALEYUV2RGBX
  728. "leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize
  729. "addl %4, %%ebx \n\t"
  730. WRITEBGR24
  731. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  732. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  733. "r" (dest), "m" (dstW),
  734. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  735. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  736. );
  737. }
  738. else if(dstFormat==IMGFMT_BGR15)
  739. {
  740. asm volatile(
  741. YSCALEYUV2RGBX
  742. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  743. #ifdef DITHER1XBPP
  744. "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
  745. "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
  746. "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
  747. #endif
  748. WRITEBGR15
  749. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  750. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  751. "r" (dest), "m" (dstW),
  752. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  753. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  754. );
  755. }
  756. else if(dstFormat==IMGFMT_BGR16)
  757. {
  758. asm volatile(
  759. YSCALEYUV2RGBX
  760. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  761. #ifdef DITHER1XBPP
  762. "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
  763. "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
  764. "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
  765. #endif
  766. WRITEBGR16
  767. :: "m" (-lumFilterSize), "m" (-chrFilterSize),
  768. "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
  769. "r" (dest), "m" (dstW),
  770. "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
  771. : "%eax", "%ebx", "%ecx", "%edx", "%esi"
  772. );
  773. }
  774. #else
  775. yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
  776. chrFilter, chrSrc, chrFilterSize,
  777. dest, dstW, dstFormat);
  778. #endif
  779. } //!FULL_UV_IPOL
  780. }
  781. /**
  782. * vertical bilinear scale YV12 to RGB
  783. */
  784. static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
  785. uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstFormat, int flags)
  786. {
  787. int yalpha1=yalpha^4095;
  788. int uvalpha1=uvalpha^4095;
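// yalpha/uvalpha are 12-bit blend weights, so XORing with 4095 gives the
// complementary weight (yalpha1 == 4095 - yalpha for values in 0..4095).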
  789. if(flags&SWS_FULL_UV_IPOL)
  790. {
  791. #ifdef HAVE_MMX
  792. if(dstFormat==IMGFMT_BGR32)
  793. {
  794. asm volatile(
  795. FULL_YSCALEYUV2RGB
  796. "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
  797. "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
  798. "movq %%mm3, %%mm1 \n\t"
  799. "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
  800. "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
  801. MOVNTQ(%%mm3, (%4, %%eax, 4))
  802. MOVNTQ(%%mm1, 8(%4, %%eax, 4))
  803. "addl $4, %%eax \n\t"
  804. "cmpl %5, %%eax \n\t"
  805. " jb 1b \n\t"
  806. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  807. "m" (yalpha1), "m" (uvalpha1)
  808. : "%eax"
  809. );
  810. }
  811. else if(dstFormat==IMGFMT_BGR24)
  812. {
  813. asm volatile(
  814. FULL_YSCALEYUV2RGB
  815. // lsb ... msb
  816. "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
  817. "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
  818. "movq %%mm3, %%mm1 \n\t"
  819. "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
  820. "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
  821. "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
  822. "psrlq $8, %%mm3 \n\t" // GR0BGR00
  823. "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
  824. "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
  825. "por %%mm2, %%mm3 \n\t" // BGRBGR00
  826. "movq %%mm1, %%mm2 \n\t"
  827. "psllq $48, %%mm1 \n\t" // 000000BG
  828. "por %%mm1, %%mm3 \n\t" // BGRBGRBG
  829. "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
  830. "psrld $16, %%mm2 \n\t" // R000R000
  831. "psrlq $24, %%mm1 \n\t" // 0BGR0000
  832. "por %%mm2, %%mm1 \n\t" // RBGRR000
  833. "movl %4, %%ebx \n\t"
  834. "addl %%eax, %%ebx \n\t"
  835. #ifdef HAVE_MMX2
  836. //FIXME Alignment
  837. "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
  838. "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
  839. #else
  840. "movd %%mm3, (%%ebx, %%eax, 2) \n\t"
  841. "psrlq $32, %%mm3 \n\t"
  842. "movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
  843. "movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
  844. #endif
  845. "addl $4, %%eax \n\t"
  846. "cmpl %5, %%eax \n\t"
  847. " jb 1b \n\t"
  848. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
  849. "m" (yalpha1), "m" (uvalpha1)
  850. : "%eax", "%ebx"
  851. );
  852. }
  853. else if(dstFormat==IMGFMT_BGR15)
  854. {
  855. asm volatile(
  856. FULL_YSCALEYUV2RGB
  857. #ifdef DITHER1XBPP
  858. "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
  859. "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
  860. "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
  861. #endif
  862. "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
  863. "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
  864. "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
  865. "psrlw $3, %%mm3 \n\t"
  866. "psllw $2, %%mm1 \n\t"
  867. "psllw $7, %%mm0 \n\t"
  868. "pand "MANGLE(g15Mask)", %%mm1 \n\t"
  869. "pand "MANGLE(r15Mask)", %%mm0 \n\t"
  870. "por %%mm3, %%mm1 \n\t"
  871. "por %%mm1, %%mm0 \n\t"
  872. MOVNTQ(%%mm0, (%4, %%eax, 2))
  873. "addl $4, %%eax \n\t"
  874. "cmpl %5, %%eax \n\t"
  875. " jb 1b \n\t"
  876. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  877. "m" (yalpha1), "m" (uvalpha1)
  878. : "%eax"
  879. );
  880. }
  881. else if(dstFormat==IMGFMT_BGR16)
  882. {
  883. asm volatile(
  884. FULL_YSCALEYUV2RGB
  885. #ifdef DITHER1XBPP
  886. "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
  887. "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
  888. "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
  889. #endif
  890. "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
  891. "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
  892. "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
  893. "psrlw $3, %%mm3 \n\t"
  894. "psllw $3, %%mm1 \n\t"
  895. "psllw $8, %%mm0 \n\t"
  896. "pand "MANGLE(g16Mask)", %%mm1 \n\t"
  897. "pand "MANGLE(r16Mask)", %%mm0 \n\t"
  898. "por %%mm3, %%mm1 \n\t"
  899. "por %%mm1, %%mm0 \n\t"
  900. MOVNTQ(%%mm0, (%4, %%eax, 2))
  901. "addl $4, %%eax \n\t"
  902. "cmpl %5, %%eax \n\t"
  903. " jb 1b \n\t"
  904. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  905. "m" (yalpha1), "m" (uvalpha1)
  906. : "%eax"
  907. );
  908. }
  909. #else
  910. if(dstFormat==IMGFMT_BGR32)
  911. {
  912. int i;
  913. for(i=0;i<dstW;i++){
  914. // vertical linear interpolation && yuv2rgb in a single step:
  915. int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  916. int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
  917. int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
  918. dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
  919. dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
  920. dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
  921. dest+= 4;
  922. }
  923. }
  924. else if(dstFormat==IMGFMT_BGR24)
  925. {
  926. int i;
  927. for(i=0;i<dstW;i++){
  928. // vertical linear interpolation && yuv2rgb in a single step:
  929. int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  930. int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
  931. int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
  932. dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
  933. dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
  934. dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
  935. dest+= 3;
  936. }
  937. }
  938. else if(dstFormat==IMGFMT_BGR16)
  939. {
  940. int i;
  941. for(i=0;i<dstW;i++){
  942. // vertical linear interpolation && yuv2rgb in a single step:
  943. int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  944. int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
  945. int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
  946. ((uint16_t*)dest)[i] =
  947. clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
  948. clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
  949. clip_table16r[(Y + yuvtab_3343[V]) >>13];
  950. }
  951. }
  952. else if(dstFormat==IMGFMT_BGR15)
  953. {
  954. int i;
  955. for(i=0;i<dstW;i++){
  956. // vertical linear interpolation && yuv2rgb in a single step:
  957. int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  958. int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
  959. int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
  960. ((uint16_t*)dest)[i] =
  961. clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
  962. clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
  963. clip_table15r[(Y + yuvtab_3343[V]) >>13];
  964. }
  965. }
  966. #endif
  967. }//FULL_UV_IPOL
  968. else
  969. {
  970. #ifdef HAVE_MMX
  971. if(dstFormat==IMGFMT_BGR32)
  972. {
  973. asm volatile(
  974. YSCALEYUV2RGB
  975. WRITEBGR32
  976. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  977. "m" (yalpha1), "m" (uvalpha1)
  978. : "%eax"
  979. );
  980. }
  981. else if(dstFormat==IMGFMT_BGR24)
  982. {
  983. asm volatile(
  984. "movl %4, %%ebx \n\t"
  985. YSCALEYUV2RGB
  986. WRITEBGR24
  987. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
  988. "m" (yalpha1), "m" (uvalpha1)
  989. : "%eax", "%ebx"
  990. );
  991. }
  992. else if(dstFormat==IMGFMT_BGR15)
  993. {
  994. asm volatile(
  995. YSCALEYUV2RGB
  996. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  997. #ifdef DITHER1XBPP
  998. "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
  999. "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
  1000. "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
  1001. #endif
  1002. WRITEBGR15
  1003. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  1004. "m" (yalpha1), "m" (uvalpha1)
  1005. : "%eax"
  1006. );
  1007. }
  1008. else if(dstFormat==IMGFMT_BGR16)
  1009. {
  1010. asm volatile(
  1011. YSCALEYUV2RGB
  1012. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1013. #ifdef DITHER1XBPP
  1014. "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
  1015. "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
  1016. "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
  1017. #endif
  1018. WRITEBGR16
  1019. :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  1020. "m" (yalpha1), "m" (uvalpha1)
  1021. : "%eax"
  1022. );
  1023. }
  1024. #else
  1025. if(dstFormat==IMGFMT_BGR32)
  1026. {
  1027. int i;
  1028. for(i=0; i<dstW-1; i+=2){
  1029. // vertical linear interpolation && yuv2rgb in a single step:
  1030. int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  1031. int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
  1032. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1033. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1034. int Cb= yuvtab_40cf[U];
  1035. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1036. int Cr= yuvtab_3343[V];
  1037. dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
  1038. dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
  1039. dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
  1040. dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
  1041. dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
  1042. dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
  1043. }
  1044. }
  1045. else if(dstFormat==IMGFMT_BGR24)
  1046. {
  1047. int i;
  1048. for(i=0; i<dstW-1; i+=2){
  1049. // vertical linear interpolation && yuv2rgb in a single step:
  1050. int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  1051. int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
  1052. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1053. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1054. int Cb= yuvtab_40cf[U];
  1055. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1056. int Cr= yuvtab_3343[V];
  1057. dest[0]=clip_table[((Y1 + Cb) >>13)];
  1058. dest[1]=clip_table[((Y1 + Cg) >>13)];
  1059. dest[2]=clip_table[((Y1 + Cr) >>13)];
  1060. dest[3]=clip_table[((Y2 + Cb) >>13)];
  1061. dest[4]=clip_table[((Y2 + Cg) >>13)];
  1062. dest[5]=clip_table[((Y2 + Cr) >>13)];
  1063. dest+=6;
  1064. }
  1065. }
  1066. else if(dstFormat==IMGFMT_BGR16)
  1067. {
  1068. int i;
  1069. for(i=0; i<dstW-1; i+=2){
  1070. // vertical linear interpolation && yuv2rgb in a single step:
  1071. int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  1072. int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
  1073. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1074. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1075. int Cb= yuvtab_40cf[U];
  1076. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1077. int Cr= yuvtab_3343[V];
  1078. ((uint16_t*)dest)[i] =
  1079. clip_table16b[(Y1 + Cb) >>13] |
  1080. clip_table16g[(Y1 + Cg) >>13] |
  1081. clip_table16r[(Y1 + Cr) >>13];
  1082. ((uint16_t*)dest)[i+1] =
  1083. clip_table16b[(Y2 + Cb) >>13] |
  1084. clip_table16g[(Y2 + Cg) >>13] |
  1085. clip_table16r[(Y2 + Cr) >>13];
  1086. }
  1087. }
  1088. else if(dstFormat==IMGFMT_BGR15)
  1089. {
  1090. int i;
  1091. for(i=0; i<dstW-1; i+=2){
  1092. // vertical linear interpolation && yuv2rgb in a single step:
  1093. int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
  1094. int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
  1095. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1096. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1097. int Cb= yuvtab_40cf[U];
  1098. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1099. int Cr= yuvtab_3343[V];
  1100. ((uint16_t*)dest)[i] =
  1101. clip_table15b[(Y1 + Cb) >>13] |
  1102. clip_table15g[(Y1 + Cg) >>13] |
  1103. clip_table15r[(Y1 + Cr) >>13];
  1104. ((uint16_t*)dest)[i+1] =
  1105. clip_table15b[(Y2 + Cb) >>13] |
  1106. clip_table15g[(Y2 + Cg) >>13] |
  1107. clip_table15r[(Y2 + Cr) >>13];
  1108. }
  1109. }
  1110. #endif
  1111. } //!FULL_UV_IPOL
  1112. }
  1113. /**
  1114. * YV12 to RGB without scaling or interpolating
  1115. */
  1116. static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
  1117. uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags)
  1118. {
  1119. int uvalpha1=uvalpha^4095;
  1120. const int yalpha1=0;
  1121. if(flags&SWS_FULL_UV_IPOL)
  1122. {
  1123. RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstFormat, flags);
  1124. return;
  1125. }
  1126. #ifdef HAVE_MMX
  1127. if( uvalpha < 2048 ) // note this is not correct (it shifts chrominance by 0.5 pixels) but it's a bit faster
  1128. {
  1129. if(dstFormat==IMGFMT_BGR32)
  1130. {
  1131. asm volatile(
  1132. YSCALEYUV2RGB1
  1133. WRITEBGR32
  1134. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  1135. "m" (yalpha1), "m" (uvalpha1)
  1136. : "%eax"
  1137. );
  1138. }
  1139. else if(dstFormat==IMGFMT_BGR24)
  1140. {
  1141. asm volatile(
  1142. "movl %4, %%ebx \n\t"
  1143. YSCALEYUV2RGB1
  1144. WRITEBGR24
  1145. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
  1146. "m" (yalpha1), "m" (uvalpha1)
  1147. : "%eax", "%ebx"
  1148. );
  1149. }
  1150. else if(dstFormat==IMGFMT_BGR15)
  1151. {
  1152. asm volatile(
  1153. YSCALEYUV2RGB1
  1154. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1155. #ifdef DITHER1XBPP
  1156. "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
  1157. "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
  1158. "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
  1159. #endif
  1160. WRITEBGR15
  1161. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  1162. "m" (yalpha1), "m" (uvalpha1)
  1163. : "%eax"
  1164. );
  1165. }
  1166. else if(dstFormat==IMGFMT_BGR16)
  1167. {
  1168. asm volatile(
  1169. YSCALEYUV2RGB1
  1170. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1171. #ifdef DITHER1XBPP
  1172. "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
  1173. "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
  1174. "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
  1175. #endif
  1176. WRITEBGR16
  1177. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  1178. "m" (yalpha1), "m" (uvalpha1)
  1179. : "%eax"
  1180. );
  1181. }
  1182. }
  1183. else
  1184. {
  1185. if(dstFormat==IMGFMT_BGR32)
  1186. {
  1187. asm volatile(
  1188. YSCALEYUV2RGB1b
  1189. WRITEBGR32
  1190. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  1191. "m" (yalpha1), "m" (uvalpha1)
  1192. : "%eax"
  1193. );
  1194. }
  1195. else if(dstFormat==IMGFMT_BGR24)
  1196. {
  1197. asm volatile(
  1198. "movl %4, %%ebx \n\t"
  1199. YSCALEYUV2RGB1b
  1200. WRITEBGR24
  1201. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
  1202. "m" (yalpha1), "m" (uvalpha1)
  1203. : "%eax", "%ebx"
  1204. );
  1205. }
  1206. else if(dstFormat==IMGFMT_BGR15)
  1207. {
  1208. asm volatile(
  1209. YSCALEYUV2RGB1b
  1210. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1211. #ifdef DITHER1XBPP
  1212. "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
  1213. "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
  1214. "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
  1215. #endif
  1216. WRITEBGR15
  1217. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  1218. "m" (yalpha1), "m" (uvalpha1)
  1219. : "%eax"
  1220. );
  1221. }
  1222. else if(dstFormat==IMGFMT_BGR16)
  1223. {
  1224. asm volatile(
  1225. YSCALEYUV2RGB1b
  1226. /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
  1227. #ifdef DITHER1XBPP
  1228. "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
  1229. "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
  1230. "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
  1231. #endif
  1232. WRITEBGR16
  1233. :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
  1234. "m" (yalpha1), "m" (uvalpha1)
  1235. : "%eax"
  1236. );
  1237. }
  1238. }
  1239. #else
  1240. //FIXME write 2 versions (for even & odd lines)
  1241. if(dstFormat==IMGFMT_BGR32)
  1242. {
  1243. int i;
  1244. for(i=0; i<dstW-1; i+=2){
  1245. // vertical linear interpolation && yuv2rgb in a single step:
  1246. int Y1=yuvtab_2568[buf0[i]>>7];
  1247. int Y2=yuvtab_2568[buf0[i+1]>>7];
  1248. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1249. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1250. int Cb= yuvtab_40cf[U];
  1251. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1252. int Cr= yuvtab_3343[V];
  1253. dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
  1254. dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
  1255. dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
  1256. dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
  1257. dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
  1258. dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
  1259. }
  1260. }
  1261. else if(dstFormat==IMGFMT_BGR24)
  1262. {
  1263. int i;
  1264. for(i=0; i<dstW-1; i+=2){
  1265. // vertical linear interpolation && yuv2rgb in a single step:
  1266. int Y1=yuvtab_2568[buf0[i]>>7];
  1267. int Y2=yuvtab_2568[buf0[i+1]>>7];
  1268. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1269. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1270. int Cb= yuvtab_40cf[U];
  1271. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1272. int Cr= yuvtab_3343[V];
  1273. dest[0]=clip_table[((Y1 + Cb) >>13)];
  1274. dest[1]=clip_table[((Y1 + Cg) >>13)];
  1275. dest[2]=clip_table[((Y1 + Cr) >>13)];
  1276. dest[3]=clip_table[((Y2 + Cb) >>13)];
  1277. dest[4]=clip_table[((Y2 + Cg) >>13)];
  1278. dest[5]=clip_table[((Y2 + Cr) >>13)];
  1279. dest+=6;
  1280. }
  1281. }
  1282. else if(dstFormat==IMGFMT_BGR16)
  1283. {
  1284. int i;
  1285. for(i=0; i<dstW-1; i+=2){
  1286. // vertical linear interpolation && yuv2rgb in a single step:
  1287. int Y1=yuvtab_2568[buf0[i]>>7];
  1288. int Y2=yuvtab_2568[buf0[i+1]>>7];
  1289. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1290. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1291. int Cb= yuvtab_40cf[U];
  1292. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1293. int Cr= yuvtab_3343[V];
  1294. ((uint16_t*)dest)[i] =
  1295. clip_table16b[(Y1 + Cb) >>13] |
  1296. clip_table16g[(Y1 + Cg) >>13] |
  1297. clip_table16r[(Y1 + Cr) >>13];
  1298. ((uint16_t*)dest)[i+1] =
  1299. clip_table16b[(Y2 + Cb) >>13] |
  1300. clip_table16g[(Y2 + Cg) >>13] |
  1301. clip_table16r[(Y2 + Cr) >>13];
  1302. }
  1303. }
  1304. else if(dstFormat==IMGFMT_BGR15)
  1305. {
  1306. int i;
  1307. for(i=0; i<dstW-1; i+=2){
  1308. // vertical linear interpolation && yuv2rgb in a single step:
  1309. int Y1=yuvtab_2568[buf0[i]>>7];
  1310. int Y2=yuvtab_2568[buf0[i+1]>>7];
  1311. int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
  1312. int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
  1313. int Cb= yuvtab_40cf[U];
  1314. int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
  1315. int Cr= yuvtab_3343[V];
  1316. ((uint16_t*)dest)[i] =
  1317. clip_table15b[(Y1 + Cb) >>13] |
  1318. clip_table15g[(Y1 + Cg) >>13] |
  1319. clip_table15r[(Y1 + Cr) >>13];
  1320. ((uint16_t*)dest)[i+1] =
  1321. clip_table15b[(Y2 + Cb) >>13] |
  1322. clip_table15g[(Y2 + Cg) >>13] |
  1323. clip_table15r[(Y2 + Cr) >>13];
  1324. }
  1325. }
  1326. #endif
  1327. }
  1328. // Bilinear / Bicubic scaling
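// The horizontal scaler below is a plain FIR filter: each output sample
// convolves filterSize source bytes starting at filterPos[i] with 16-bit
// coefficients, roughly (see the C fallback at the end of this function)
//
//     dst[i]= clamp( (sum_j src[filterPos[i]+j]*filter[i*filterSize+j]) >> 7, 0, 32767 );
//
// The MMX code special-cases filterSize 4 and 8 and keeps a generic loop
// for the remaining sizes.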
  1329. static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
  1330. int16_t *filter, int16_t *filterPos, int filterSize)
  1331. {
  1332. #ifdef HAVE_MMX
  1333. if(filterSize==4) // always true for upscaling, sometimes for downscaling too
  1334. {
  1335. int counter= -2*dstW;
  1336. filter-= counter*2;
  1337. filterPos-= counter/2;
  1338. dst-= counter/2;
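// counter starts at -2*dstW and the pointers are biased by it, so the loop
// below can index everything off the single counter register and exit via
// the carry flag ("jnc 1b") once the counter wraps up to zero.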
  1339. asm volatile(
  1340. "pxor %%mm7, %%mm7 \n\t"
  1341. "movq "MANGLE(w02)", %%mm6 \n\t"
  1342. "pushl %%ebp \n\t" // we use 7 regs here ...
  1343. "movl %%eax, %%ebp \n\t"
  1344. ".balign 16 \n\t"
  1345. "1: \n\t"
  1346. "movzwl (%2, %%ebp), %%eax \n\t"
  1347. "movzwl 2(%2, %%ebp), %%ebx \n\t"
  1348. "movq (%1, %%ebp, 4), %%mm1 \n\t"
  1349. "movq 8(%1, %%ebp, 4), %%mm3 \n\t"
  1350. "movd (%3, %%eax), %%mm0 \n\t"
  1351. "movd (%3, %%ebx), %%mm2 \n\t"
  1352. "punpcklbw %%mm7, %%mm0 \n\t"
  1353. "punpcklbw %%mm7, %%mm2 \n\t"
  1354. "pmaddwd %%mm1, %%mm0 \n\t"
  1355. "pmaddwd %%mm2, %%mm3 \n\t"
  1356. "psrad $8, %%mm0 \n\t"
  1357. "psrad $8, %%mm3 \n\t"
  1358. "packssdw %%mm3, %%mm0 \n\t"
  1359. "pmaddwd %%mm6, %%mm0 \n\t"
  1360. "packssdw %%mm0, %%mm0 \n\t"
  1361. "movd %%mm0, (%4, %%ebp) \n\t"
  1362. "addl $4, %%ebp \n\t"
  1363. " jnc 1b \n\t"
  1364. "popl %%ebp \n\t"
  1365. : "+a" (counter)
  1366. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  1367. : "%ebx"
  1368. );
  1369. }
  1370. else if(filterSize==8)
  1371. {
  1372. int counter= -2*dstW;
  1373. filter-= counter*4;
  1374. filterPos-= counter/2;
  1375. dst-= counter/2;
  1376. asm volatile(
  1377. "pxor %%mm7, %%mm7 \n\t"
  1378. "movq "MANGLE(w02)", %%mm6 \n\t"
  1379. "pushl %%ebp \n\t" // we use 7 regs here ...
  1380. "movl %%eax, %%ebp \n\t"
  1381. ".balign 16 \n\t"
  1382. "1: \n\t"
  1383. "movzwl (%2, %%ebp), %%eax \n\t"
  1384. "movzwl 2(%2, %%ebp), %%ebx \n\t"
  1385. "movq (%1, %%ebp, 8), %%mm1 \n\t"
  1386. "movq 16(%1, %%ebp, 8), %%mm3 \n\t"
  1387. "movd (%3, %%eax), %%mm0 \n\t"
  1388. "movd (%3, %%ebx), %%mm2 \n\t"
  1389. "punpcklbw %%mm7, %%mm0 \n\t"
  1390. "punpcklbw %%mm7, %%mm2 \n\t"
  1391. "pmaddwd %%mm1, %%mm0 \n\t"
  1392. "pmaddwd %%mm2, %%mm3 \n\t"
  1393. "movq 8(%1, %%ebp, 8), %%mm1 \n\t"
  1394. "movq 24(%1, %%ebp, 8), %%mm5 \n\t"
  1395. "movd 4(%3, %%eax), %%mm4 \n\t"
  1396. "movd 4(%3, %%ebx), %%mm2 \n\t"
  1397. "punpcklbw %%mm7, %%mm4 \n\t"
  1398. "punpcklbw %%mm7, %%mm2 \n\t"
  1399. "pmaddwd %%mm1, %%mm4 \n\t"
  1400. "pmaddwd %%mm2, %%mm5 \n\t"
  1401. "paddd %%mm4, %%mm0 \n\t"
  1402. "paddd %%mm5, %%mm3 \n\t"
  1403. "psrad $8, %%mm0 \n\t"
  1404. "psrad $8, %%mm3 \n\t"
  1405. "packssdw %%mm3, %%mm0 \n\t"
  1406. "pmaddwd %%mm6, %%mm0 \n\t"
  1407. "packssdw %%mm0, %%mm0 \n\t"
  1408. "movd %%mm0, (%4, %%ebp) \n\t"
  1409. "addl $4, %%ebp \n\t"
  1410. " jnc 1b \n\t"
  1411. "popl %%ebp \n\t"
  1412. : "+a" (counter)
  1413. : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
  1414. : "%ebx"
  1415. );
  1416. }
  1417. else
  1418. {
  1419. int counter= -2*dstW;
  1420. // filter-= counter*filterSize/2;
  1421. filterPos-= counter/2;
  1422. dst-= counter/2;
  1423. asm volatile(
  1424. "pxor %%mm7, %%mm7 \n\t"
  1425. "movq "MANGLE(w02)", %%mm6 \n\t"
  1426. ".balign 16 \n\t"
  1427. "1: \n\t"
  1428. "movl %2, %%ecx \n\t"
  1429. "movzwl (%%ecx, %0), %%eax \n\t"
  1430. "movzwl 2(%%ecx, %0), %%ebx \n\t"
  1431. "movl %5, %%ecx \n\t"
  1432. "pxor %%mm4, %%mm4 \n\t"
  1433. "pxor %%mm5, %%mm5 \n\t"
  1434. "2: \n\t"
  1435. "movq (%1), %%mm1 \n\t"
  1436. "movq (%1, %6), %%mm3 \n\t"
  1437. "movd (%%ecx, %%eax), %%mm0 \n\t"
  1438. "movd (%%ecx, %%ebx), %%mm2 \n\t"
  1439. "punpcklbw %%mm7, %%mm0 \n\t"
  1440. "punpcklbw %%mm7, %%mm2 \n\t"
  1441. "pmaddwd %%mm1, %%mm0 \n\t"
  1442. "pmaddwd %%mm2, %%mm3 \n\t"
  1443. "paddd %%mm3, %%mm5 \n\t"
  1444. "paddd %%mm0, %%mm4 \n\t"
  1445. "addl $8, %1 \n\t"
  1446. "addl $4, %%ecx \n\t"
  1447. "cmpl %4, %%ecx \n\t"
  1448. " jb 2b \n\t"
  1449. "addl %6, %1 \n\t"
  1450. "psrad $8, %%mm4 \n\t"
  1451. "psrad $8, %%mm5 \n\t"
  1452. "packssdw %%mm5, %%mm4 \n\t"
  1453. "pmaddwd %%mm6, %%mm4 \n\t"
  1454. "packssdw %%mm4, %%mm4 \n\t"
  1455. "movl %3, %%eax \n\t"
  1456. "movd %%mm4, (%%eax, %0) \n\t"
  1457. "addl $4, %0 \n\t"
  1458. " jnc 1b \n\t"
  1459. : "+r" (counter), "+r" (filter)
  1460. : "m" (filterPos), "m" (dst), "m"(src+filterSize),
  1461. "m" (src), "r" (filterSize*2)
  1462. : "%ebx", "%eax", "%ecx"
  1463. );
  1464. }
  1465. #else
  1466. int i;
  1467. for(i=0; i<dstW; i++)
  1468. {
  1469. int j;
  1470. int srcPos= filterPos[i];
  1471. int val=0;
  1472. // printf("filterPos: %d\n", filterPos[i]);
  1473. for(j=0; j<filterSize; j++)
  1474. {
  1475. // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
  1476. val += ((int)src[srcPos + j])*filter[filterSize*i + j];
  1477. }
  1478. // filter += hFilterSize;
  1479. dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
  1480. // dst[i] = val>>7;
  1481. }
  1482. #endif
  1483. }
  1484. // *** horizontal scale Y line to temp buffer
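// For SWS_FAST_BILINEAR the scalers below walk the source with a 16.16
// fixed-point position (xpos += xInc) and linearly interpolate two adjacent
// bytes; the plain-C shape of that (see the fallback at the end of this
// function) is roughly
//
//     xx= xpos>>16;  xalpha= (xpos&0xFFFF)>>9;
//     dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
//
// Otherwise the generic hScale() above is used.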
  1485. static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
  1486. int flags, int canMMX2BeUsed, int16_t *hLumFilter,
  1487. int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode)
  1488. {
  1489. #ifdef HAVE_MMX
  1490. // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86 asm one)
  1491. if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
  1492. #else
  1493. if(!(flags&SWS_FAST_BILINEAR))
  1494. #endif
  1495. {
  1496. RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
  1497. }
  1498. else // Fast Bilinear upscale / crap downscale
  1499. {
  1500. #ifdef ARCH_X86
  1501. #ifdef HAVE_MMX2
  1502. int i;
  1503. if(canMMX2BeUsed)
  1504. {
  1505. asm volatile(
  1506. "pxor %%mm7, %%mm7 \n\t"
  1507. "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
  1508. "movd %5, %%mm6 \n\t" // xInc&0xFFFF
  1509. "punpcklwd %%mm6, %%mm6 \n\t"
  1510. "punpcklwd %%mm6, %%mm6 \n\t"
  1511. "movq %%mm6, %%mm2 \n\t"
  1512. "psllq $16, %%mm2 \n\t"
  1513. "paddw %%mm6, %%mm2 \n\t"
  1514. "psllq $16, %%mm2 \n\t"
  1515. "paddw %%mm6, %%mm2 \n\t"
  1516. "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF
  1517. "movq %%mm2, %%mm4 \n\t"
  1518. "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
  1519. "punpcklwd %%mm6, %%mm6 \n\t"
  1520. "punpcklwd %%mm6, %%mm6 \n\t"
  1521. "xorl %%eax, %%eax \n\t" // i
  1522. "movl %0, %%esi \n\t" // src
  1523. "movl %1, %%edi \n\t" // buf1
  1524. "movl %3, %%edx \n\t" // (xInc*4)>>16
  1525. "xorl %%ecx, %%ecx \n\t"
  1526. "xorl %%ebx, %%ebx \n\t"
  1527. "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
  1528. #define FUNNY_Y_CODE \
  1529. PREFETCH" 1024(%%esi) \n\t"\
  1530. PREFETCH" 1056(%%esi) \n\t"\
  1531. PREFETCH" 1088(%%esi) \n\t"\
  1532. "call *%6 \n\t"\
  1533. "movq %%mm4, %%mm2 \n\t"\
  1534. "xorl %%ecx, %%ecx \n\t"
  1535. FUNNY_Y_CODE
  1536. FUNNY_Y_CODE
  1537. FUNNY_Y_CODE
  1538. FUNNY_Y_CODE
  1539. FUNNY_Y_CODE
  1540. FUNNY_Y_CODE
  1541. FUNNY_Y_CODE
  1542. FUNNY_Y_CODE
  1543. :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
  1544. "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode)
  1545. : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
  1546. );
  1547. for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
  1548. }
  1549. else
  1550. {
  1551. #endif
  1552. //NO MMX just normal asm ...
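/*
 * The plain x86 loop below keeps the source position as 16.16 fixed point
 * split across two registers: the fractional part in %%cx and the integer
 * part in %%ebx. "addw %4, %%cx" adds the fractional increment (xInc&0xFFFF)
 * and "adcl %3, %%ebx" adds the integer increment (xInc>>16) plus the carry
 * from the fractional add, so both halves advance together. Two output
 * pixels are produced per iteration.
 */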
  1553. asm volatile(
  1554. "xorl %%eax, %%eax \n\t" // i
  1555. "xorl %%ebx, %%ebx \n\t" // xx
  1556. "xorl %%ecx, %%ecx \n\t" // 2*xalpha
  1557. ".balign 16 \n\t"
  1558. "1: \n\t"
  1559. "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
  1560. "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
  1561. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  1562. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  1563. "shll $16, %%edi \n\t"
  1564. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  1565. "movl %1, %%edi \n\t"
  1566. "shrl $9, %%esi \n\t"
  1567. "movw %%si, (%%edi, %%eax, 2) \n\t"
  1568. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  1569. "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
  1570. "movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
  1571. "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
  1572. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  1573. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  1574. "shll $16, %%edi \n\t"
  1575. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  1576. "movl %1, %%edi \n\t"
  1577. "shrl $9, %%esi \n\t"
  1578. "movw %%si, 2(%%edi, %%eax, 2) \n\t"
  1579. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  1580. "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
  1581. "addl $2, %%eax \n\t"
  1582. "cmpl %2, %%eax \n\t"
  1583. " jb 1b \n\t"
  1584. :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
  1585. : "%eax", "%ebx", "%ecx", "%edi", "%esi"
  1586. );
  1587. #ifdef HAVE_MMX2
  1588. } //if MMX2 cant be used
  1589. #endif
  1590. #else
  1591. int i;
  1592. unsigned int xpos=0;
  1593. for(i=0;i<dstWidth;i++)
  1594. {
  1595. register unsigned int xx=xpos>>16;
  1596. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  1597. dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
  1598. xpos+=xInc;
  1599. }
  1600. #endif
  1601. }
  1602. }
  1603. inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
  1604. int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
  1605. int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode)
  1606. {
  1607. #ifdef HAVE_MMX
1608. // use the new MMX scaler if MMX2 can't be used (it's faster than the plain x86 asm one)
  1609. if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
  1610. #else
  1611. if(!(flags&SWS_FAST_BILINEAR))
  1612. #endif
  1613. {
  1614. RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  1615. RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
  1616. }
  1617. else // Fast Bilinear upscale / crap downscale
  1618. {
  1619. #ifdef ARCH_X86
  1620. #ifdef HAVE_MMX2
  1621. int i;
  1622. if(canMMX2BeUsed)
  1623. {
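/*
 * Same runtime-generated MMX2 scaler as in hyscale, run twice: first over
 * src1 (U) into dst, then over src2 (V) into dst+2048 ("addl $4096, %%edi"
 * below is the same offset in bytes, as dst holds 16 bit samples).
 */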
  1624. asm volatile(
  1625. "pxor %%mm7, %%mm7 \n\t"
  1626. "pxor %%mm2, %%mm2 \n\t" // 2*xalpha
  1627. "movd %5, %%mm6 \n\t" // xInc&0xFFFF
  1628. "punpcklwd %%mm6, %%mm6 \n\t"
  1629. "punpcklwd %%mm6, %%mm6 \n\t"
  1630. "movq %%mm6, %%mm2 \n\t"
  1631. "psllq $16, %%mm2 \n\t"
  1632. "paddw %%mm6, %%mm2 \n\t"
  1633. "psllq $16, %%mm2 \n\t"
  1634. "paddw %%mm6, %%mm2 \n\t"
  1635. "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
  1636. "movq %%mm2, %%mm4 \n\t"
  1637. "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
  1638. "punpcklwd %%mm6, %%mm6 \n\t"
  1639. "punpcklwd %%mm6, %%mm6 \n\t"
  1640. "xorl %%eax, %%eax \n\t" // i
  1641. "movl %0, %%esi \n\t" // src
  1642. "movl %1, %%edi \n\t" // buf1
  1643. "movl %3, %%edx \n\t" // (xInc*4)>>16
  1644. "xorl %%ecx, %%ecx \n\t"
  1645. "xorl %%ebx, %%ebx \n\t"
  1646. "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
  1647. #define FUNNYUVCODE \
  1648. PREFETCH" 1024(%%esi) \n\t"\
  1649. PREFETCH" 1056(%%esi) \n\t"\
  1650. PREFETCH" 1088(%%esi) \n\t"\
  1651. "call *%7 \n\t"\
  1652. "movq %%mm4, %%mm2 \n\t"\
  1653. "xorl %%ecx, %%ecx \n\t"
  1654. FUNNYUVCODE
  1655. FUNNYUVCODE
  1656. FUNNYUVCODE
  1657. FUNNYUVCODE
  1658. FUNNYUVCODE
  1659. FUNNYUVCODE
  1660. FUNNYUVCODE
  1661. FUNNYUVCODE
  1662. "xorl %%eax, %%eax \n\t" // i
  1663. "movl %6, %%esi \n\t" // src
  1664. "movl %1, %%edi \n\t" // buf1
  1665. "addl $4096, %%edi \n\t"
  1666. FUNNYUVCODE
  1667. FUNNYUVCODE
  1668. FUNNYUVCODE
  1669. FUNNYUVCODE
  1670. FUNNYUVCODE
  1671. FUNNYUVCODE
  1672. FUNNYUVCODE
  1673. FUNNYUVCODE
  1674. :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
  1675. "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode)
  1676. : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
  1677. );
  1678. for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
  1679. {
  1680. // printf("%d %d %d\n", dstWidth, i, srcW);
  1681. dst[i] = src1[srcW-1]*128;
  1682. dst[i+2048] = src2[srcW-1]*128;
  1683. }
  1684. }
  1685. else
  1686. {
  1687. #endif
  1688. asm volatile(
  1689. "xorl %%eax, %%eax \n\t" // i
  1690. "xorl %%ebx, %%ebx \n\t" // xx
  1691. "xorl %%ecx, %%ecx \n\t" // 2*xalpha
  1692. ".balign 16 \n\t"
  1693. "1: \n\t"
  1694. "movl %0, %%esi \n\t"
  1695. "movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
  1696. "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
  1697. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  1698. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  1699. "shll $16, %%edi \n\t"
  1700. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  1701. "movl %1, %%edi \n\t"
  1702. "shrl $9, %%esi \n\t"
  1703. "movw %%si, (%%edi, %%eax, 2) \n\t"
  1704. "movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
  1705. "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
  1706. "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
  1707. "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
  1708. "shll $16, %%edi \n\t"
  1709. "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
  1710. "movl %1, %%edi \n\t"
  1711. "shrl $9, %%esi \n\t"
  1712. "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
  1713. "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
  1714. "adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
  1715. "addl $1, %%eax \n\t"
  1716. "cmpl %2, %%eax \n\t"
  1717. " jb 1b \n\t"
  1718. :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
  1719. "r" (src2)
  1720. : "%eax", "%ebx", "%ecx", "%edi", "%esi"
  1721. );
  1722. #ifdef HAVE_MMX2
  1723. } //if MMX2 cant be used
  1724. #endif
  1725. #else
  1726. int i;
  1727. unsigned int xpos=0;
  1728. for(i=0;i<dstWidth;i++)
  1729. {
  1730. register unsigned int xx=xpos>>16;
  1731. register unsigned int xalpha=(xpos&0xFFFF)>>9;
  1732. dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
  1733. dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
  1734. /* slower
  1735. dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
  1736. dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
  1737. */
  1738. xpos+=xInc;
  1739. }
  1740. #endif
  1741. }
  1742. }
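/*
 * Main scaling function of this template. The source image is fed in
 * horizontal slices (srcSliceY = first line of the slice, srcSliceH = number
 * of lines in it); src/dst are per-plane pointers with matching strides.
 * Each call outputs as many destination lines as the slice allows and stores
 * its progress back into the context, so the next slice continues where this
 * one stopped.
 */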
  1743. static void RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  1744. int srcSliceH, uint8_t* dst[], int dstStride[]){
1745. /* load a few things into local vars to make the code more readable and faster */
  1746. const int srcW= c->srcW;
  1747. const int dstW= c->dstW;
  1748. const int dstH= c->dstH;
  1749. const int chrDstW= c->chrDstW;
  1750. const int lumXInc= c->lumXInc;
  1751. const int chrXInc= c->chrXInc;
1752. const int dstFormat= c->dstFormat; //FIXME search for dstbpp;
  1753. const int flags= c->flags;
  1754. const int canMMX2BeUsed= c->canMMX2BeUsed;
  1755. int16_t *vLumFilterPos= c->vLumFilterPos;
  1756. int16_t *vChrFilterPos= c->vChrFilterPos;
  1757. int16_t *hLumFilterPos= c->hLumFilterPos;
  1758. int16_t *hChrFilterPos= c->hChrFilterPos;
  1759. int16_t *vLumFilter= c->vLumFilter;
  1760. int16_t *vChrFilter= c->vChrFilter;
  1761. int16_t *hLumFilter= c->hLumFilter;
  1762. int16_t *hChrFilter= c->hChrFilter;
  1763. int16_t *lumMmxFilter= c->lumMmxFilter;
  1764. int16_t *chrMmxFilter= c->chrMmxFilter;
  1765. const int vLumFilterSize= c->vLumFilterSize;
  1766. const int vChrFilterSize= c->vChrFilterSize;
  1767. const int hLumFilterSize= c->hLumFilterSize;
  1768. const int hChrFilterSize= c->hChrFilterSize;
  1769. int16_t **lumPixBuf= c->lumPixBuf;
  1770. int16_t **chrPixBuf= c->chrPixBuf;
  1771. const int vLumBufSize= c->vLumBufSize;
  1772. const int vChrBufSize= c->vChrBufSize;
  1773. uint8_t *funnyYCode= c->funnyYCode;
  1774. uint8_t *funnyUVCode= c->funnyUVCode;
1775. /* variables which will change and which we need to store back in the context */
  1776. int dstY= c->dstY;
  1777. int lumBufIndex= c->lumBufIndex;
  1778. int chrBufIndex= c->chrBufIndex;
  1779. int lastInLumBuf= c->lastInLumBuf;
  1780. int lastInChrBuf= c->lastInChrBuf;
  1781. if(srcSliceY ==0){
  1782. lumBufIndex=0;
  1783. chrBufIndex=0;
  1784. dstY=0;
  1785. lastInLumBuf= -1;
  1786. lastInChrBuf= -1;
  1787. }
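/*
 * lumPixBuf/chrPixBuf are ring buffers of horizontally scaled lines
 * (vLumBufSize / vChrBufSize entries; the pointer arrays appear to be
 * doubled so a vertical filter window never has to wrap). lastInLumBuf /
 * lastInChrBuf are the last source lines already present in them; for each
 * output line dstY the vertical filter needs source lines
 * firstLumSrcY..lastLumSrcY (and the chroma equivalents), which are scaled
 * on demand below.
 */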
  1788. for(;dstY < dstH; dstY++){
  1789. unsigned char *dest =dst[0]+dstStride[0]*dstY;
  1790. unsigned char *uDest=dst[1]+dstStride[1]*(dstY>>1);
  1791. unsigned char *vDest=dst[2]+dstStride[2]*(dstY>>1);
  1792. const int chrDstY= dstFormat==IMGFMT_YV12 ? (dstY>>1) : dstY;
  1793. const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
  1794. const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
  1795. const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
  1796. const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
  1797. if(flags&SWS_FAST_BILINEAR)
  1798. {
  1799. //handle holes
  1800. if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
  1801. if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
  1802. }
  1803. ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
  1804. ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
1805. // Do we have enough lines in this slice to output the dstY line?
  1806. if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH)>>1))
  1807. {
  1808. //Do horizontal scaling
  1809. while(lastInLumBuf < lastLumSrcY)
  1810. {
  1811. uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
  1812. lumBufIndex++;
  1813. ASSERT(lumBufIndex < 2*vLumBufSize)
  1814. ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
  1815. ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
  1816. // printf("%d %d\n", lumBufIndex, vLumBufSize);
  1817. RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
  1818. flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
  1819. funnyYCode);
  1820. lastInLumBuf++;
  1821. }
  1822. while(lastInChrBuf < lastChrSrcY)
  1823. {
  1824. uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
  1825. uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
  1826. chrBufIndex++;
  1827. ASSERT(chrBufIndex < 2*vChrBufSize)
  1828. ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
  1829. ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
1830. //FIXME pass (at least some of) these parameters through the context struct
  1831. RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
  1832. flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
  1833. funnyUVCode);
  1834. lastInChrBuf++;
  1835. }
  1836. //wrap buf index around to stay inside the ring buffer
  1837. if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
  1838. if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
  1839. }
  1840. else // not enough lines left in this slice -> load the rest in the buffer
  1841. {
  1842. /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
  1843. firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
  1844. lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
  1845. vChrBufSize, vLumBufSize);
  1846. */
  1847. //Do horizontal scaling
  1848. while(lastInLumBuf+1 < srcSliceY + srcSliceH)
  1849. {
  1850. uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
  1851. lumBufIndex++;
  1852. ASSERT(lumBufIndex < 2*vLumBufSize)
  1853. ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
  1854. ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
  1855. RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
  1856. flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
  1857. funnyYCode);
  1858. lastInLumBuf++;
  1859. }
  1860. while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
  1861. {
  1862. uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
  1863. uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
  1864. chrBufIndex++;
  1865. ASSERT(chrBufIndex < 2*vChrBufSize)
  1866. ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
  1867. ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
  1868. RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
  1869. flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
  1870. funnyUVCode);
  1871. lastInChrBuf++;
  1872. }
  1873. //wrap buf index around to stay inside the ring buffer
  1874. if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
  1875. if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
1876. break; //we can't output a dstY line, so let's try with the next slice
  1877. }
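// select the per-line ordered-dither rows used by the 15/16 bpp RGB output
// (dither4 for the 6 bit green channel, dither8 for the 5 bit channels)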
  1878. #ifdef HAVE_MMX
  1879. b5Dither= dither8[dstY&1];
  1880. g6Dither= dither4[dstY&1];
  1881. g5Dither= dither8[dstY&1];
  1882. r5Dither= dither8[(dstY+1)&1];
  1883. #endif
  1884. if(dstY < dstH-2)
  1885. {
  1886. if(dstFormat==IMGFMT_YV12) //YV12
  1887. {
  1888. if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
  1889. if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
  1890. {
  1891. int16_t *lumBuf = lumPixBuf[0];
  1892. int16_t *chrBuf= chrPixBuf[0];
  1893. RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW);
  1894. }
  1895. else //General YV12
  1896. {
  1897. int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  1898. int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
  1899. RENAME(yuv2yuvX)(
  1900. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  1901. vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  1902. dest, uDest, vDest, dstW,
  1903. lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4);
  1904. }
  1905. }
  1906. else
  1907. {
  1908. int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  1909. int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
  1910. ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
  1911. ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
  1912. if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
  1913. {
  1914. int chrAlpha= vChrFilter[2*dstY+1];
  1915. RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
  1916. dest, dstW, chrAlpha, dstFormat, flags);
  1917. }
  1918. else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
  1919. {
  1920. int lumAlpha= vLumFilter[2*dstY+1];
  1921. int chrAlpha= vChrFilter[2*dstY+1];
  1922. RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
  1923. dest, dstW, lumAlpha, chrAlpha, dstFormat, flags);
  1924. }
  1925. else //General RGB
  1926. {
  1927. RENAME(yuv2rgbX)(
  1928. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  1929. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  1930. dest, dstW, dstFormat,
  1931. lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
  1932. }
  1933. }
  1934. }
1935. else // hmm, looks like we can't use MMX here without overwriting this array's tail
  1936. {
  1937. int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
  1938. int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
  1939. if(dstFormat==IMGFMT_YV12) //YV12
  1940. {
  1941. if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
  1942. yuv2yuvXinC(
  1943. vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
  1944. vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  1945. dest, uDest, vDest, dstW);
  1946. }
  1947. else
  1948. {
  1949. ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
  1950. ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
  1951. yuv2rgbXinC(
  1952. vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
  1953. vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
  1954. dest, dstW, dstFormat);
  1955. }
  1956. }
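/*
 * For reference, the C fallbacks used above (yuv2yuvXinC / yuv2rgbXinC,
 * defined elsewhere) are plain multiply-accumulates over the vertical filter
 * window. A rough sketch for the luma plane, assuming the usual 12 bit
 * vertical coefficients on top of the 15 bit horizontally scaled samples
 * (the exact shift and clipping live in the real functions):
 *
 *   for(i=0; i<dstW; i++){
 *       int j, val= 0;
 *       for(j=0; j<vLumFilterSize; j++)
 *           val += lumSrcPtr[j][i] * vLumFilter[dstY*vLumFilterSize + j];
 *       dest[i]= MIN(MAX(val>>19, 0), 255); // 15+12 bit -> 8 bit
 *   }
 */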
  1957. }
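/*
 * The MMX code above uses non-temporal stores (movntq), so an sfence is
 * issued to make them globally visible, and emms/femms restores the FPU
 * state after MMX use.
 */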
  1958. #ifdef HAVE_MMX
  1959. __asm __volatile(SFENCE:::"memory");
  1960. __asm __volatile(EMMS:::"memory");
  1961. #endif
  1962. /* store changed local vars back in the context */
  1963. c->dstY= dstY;
  1964. c->lumBufIndex= lumBufIndex;
  1965. c->chrBufIndex= chrBufIndex;
  1966. c->lastInLumBuf= lastInLumBuf;
  1967. c->lastInChrBuf= lastInChrBuf;
  1968. }