You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

373 lines
11KB

  1. /*
  2. * DSP utils : average functions are compiled twice for 3dnow/mmx2
  3. * Copyright (c) 2000, 2001 Gerard Lantau.
  4. * Copyright (c) 2002 Michael Niedermayer
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program; if not, write to the Free Software
  18. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19. *
  20. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  21. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  22. */
  /*
   * put_pixels_x2: write to block the byte-wise average of every source byte
   * and its right-hand neighbour (pixels[x] and pixels[x + 1]).
   *
   * Each row is handled as a single 8-byte movq.  One pass of the loop covers
   * four rows (two rows per half, the half being unrolled twice) and then
   * subtracts 4 from h; the loop only ends when h becomes exactly zero, so h
   * must be a positive multiple of 4.  Source and destination use the same
   * stride, line_size.  Because of the +1 load, 9 bytes are read from every
   * source row.
   *
   * PAVGB is a macro provided by the including file; presumably it expands to
   * the packed byte-average instruction of the target (3DNow!/MMX2) -- confirm
   * against its definition.
   *
   * Asm operands:
   *   %0 = h (row counter, read/write)
   *   %1 = pixels               %2 = pixels + line_size
   *   %3 = block                %4 = block + line_size
   *   %5 = 2 * line_size
   * eax is used as the running byte offset and is listed as clobbered.
   */
  23. static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  24. {
  25. __asm __volatile(
  26. "xorl %%eax, %%eax \n\t" /* eax = byte offset of the current row pair, starts at 0 */
  27. ".balign 16 \n\t"
  28. "1: \n\t"
  29. "movq (%1, %%eax), %%mm0 \n\t" /* first row of the pair: bytes x .. x+7 */
  30. "movq 1(%1, %%eax), %%mm1 \n\t" /* same row shifted by one byte: x+1 .. x+8 */
  31. "movq (%2, %%eax), %%mm2 \n\t" /* second row of the pair */
  32. "movq 1(%2, %%eax), %%mm3 \n\t"
  33. PAVGB" %%mm1, %%mm0 \n\t"
  34. PAVGB" %%mm3, %%mm2 \n\t"
  35. "movq %%mm0, (%3, %%eax) \n\t"
  36. "movq %%mm2, (%4, %%eax) \n\t"
  37. "addl %5, %%eax \n\t" /* step two rows down */
  38. "movq (%1, %%eax), %%mm0 \n\t" /* second unrolled copy: the next two rows */
  39. "movq 1(%1, %%eax), %%mm1 \n\t"
  40. "movq (%2, %%eax), %%mm2 \n\t"
  41. "movq 1(%2, %%eax), %%mm3 \n\t"
  42. PAVGB" %%mm1, %%mm0 \n\t"
  43. PAVGB" %%mm3, %%mm2 \n\t"
  44. "movq %%mm0, (%3, %%eax) \n\t"
  45. "movq %%mm2, (%4, %%eax) \n\t"
  46. "addl %5, %%eax \n\t"
  47. "subl $4, %0 \n\t" /* four rows finished in this pass */
  48. " jnz 1b \n\t"
  49. :"+g"(h)
  50. :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size),
  51. "r"(line_size<<1)
  52. :"%eax", "memory");
  53. }
  /*
   * put_no_rnd_pixels_x2: like put_pixels_x2 (average of pixels[x] and
   * pixels[x + 1] written to block), except that mm7 is subtracted with
   * unsigned saturation (psubusb) from the unshifted operand before the
   * average is taken.
   *
   * mm7 is loaded from mm_bone, which is defined outside this file;
   * presumably every byte of it is 1, so the subtraction offsets the upward
   * rounding of the average ("no_rnd") -- confirm against the definition.
   * The result is an approximation, not an exact truncating average.
   *
   * Rows are 8 bytes wide; four rows are produced per loop pass and h is
   * decremented by 4, so h must be a positive multiple of 4.
   *
   * Asm operands:
   *   %0 = h (row counter, read/write)
   *   %1 = pixels               %2 = pixels + line_size
   *   %3 = block                %4 = block + line_size
   *   %5 = 2 * line_size
   * eax is the running byte offset and is listed as clobbered.
   */
  54. static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  55. {
  56. __asm __volatile(
  57. "xorl %%eax, %%eax \n\t" /* eax = byte offset of the current row pair */
  58. "movq "MANGLE(mm_bone)", %%mm7 \n\t" /* constant kept in mm7 for the whole loop */
  59. ".balign 16 \n\t"
  60. "1: \n\t"
  61. "movq (%1, %%eax), %%mm0 \n\t"
  62. "movq 1(%1, %%eax), %%mm1 \n\t"
  63. "movq (%2, %%eax), %%mm2 \n\t"
  64. "movq 1(%2, %%eax), %%mm3 \n\t"
  65. "psubusb %%mm7, %%mm0 \n\t" /* saturating subtract on the unshifted operand only */
  66. "psubusb %%mm7, %%mm2 \n\t"
  67. PAVGB" %%mm1, %%mm0 \n\t"
  68. PAVGB" %%mm3, %%mm2 \n\t"
  69. "movq %%mm0, (%3, %%eax) \n\t"
  70. "movq %%mm2, (%4, %%eax) \n\t"
  71. "addl %5, %%eax \n\t" /* step two rows down */
  72. "movq (%1, %%eax), %%mm0 \n\t" /* second unrolled copy: the next two rows */
  73. "movq 1(%1, %%eax), %%mm1 \n\t"
  74. "movq (%2, %%eax), %%mm2 \n\t"
  75. "movq 1(%2, %%eax), %%mm3 \n\t"
  76. "psubusb %%mm7, %%mm0 \n\t"
  77. "psubusb %%mm7, %%mm2 \n\t"
  78. PAVGB" %%mm1, %%mm0 \n\t"
  79. PAVGB" %%mm3, %%mm2 \n\t"
  80. "movq %%mm0, (%3, %%eax) \n\t"
  81. "movq %%mm2, (%4, %%eax) \n\t"
  82. "addl %5, %%eax \n\t"
  83. "subl $4, %0 \n\t" /* four rows finished in this pass */
  84. " jnz 1b \n\t"
  85. :"+g"(h)
  86. :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size),
  87. "r"(line_size<<1)
  88. :"%eax", "memory");
  89. }
  /*
   * put_pixels_y2: write to block the byte-wise average of every source row
   * and the row directly below it.
   *
   * Rows are 8 bytes wide.  Source row 0 is loaded once before the loop; after
   * that each source row is loaded exactly once and reused as the "upper" row
   * of the next output row, so mm0/mm1/mm2 swap roles between the two unrolled
   * halves.  Producing h output rows reads h + 1 source rows.
   *
   * Four rows are produced per loop pass and h is decremented by 4, so h must
   * be a positive multiple of 4.
   *
   * PAVGB is a macro provided by the including file; presumably the packed
   * byte-average instruction of the target -- confirm against its definition.
   *
   * Asm operands:
   *   %0 = h (row counter, read/write)
   *   %1 = pixels   %2 = pixels + line_size   %3 = pixels + 2 * line_size
   *   %4 = block    %5 = block + line_size
   *   %6 = 2 * line_size
   * eax is the running byte offset and is listed as clobbered.
   */
  90. static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  91. {
  92. __asm __volatile(
  93. "xorl %%eax, %%eax \n\t"
  94. "movq (%1), %%mm0 \n\t" /* preload source row 0 */
  95. ".balign 16 \n\t"
  96. "1: \n\t"
  97. "movq (%2, %%eax), %%mm1 \n\t" /* row n+1 */
  98. "movq (%3, %%eax), %%mm2 \n\t" /* row n+2 */
  99. PAVGB" %%mm1, %%mm0 \n\t" /* mm0 = avg(row n, row n+1) */
  100. PAVGB" %%mm2, %%mm1 \n\t" /* mm1 = avg(row n+1, row n+2); mm2 is kept for the next half */
  101. "movq %%mm0, (%4, %%eax) \n\t"
  102. "movq %%mm1, (%5, %%eax) \n\t"
  103. "addl %6, %%eax \n\t" /* step two rows down */
  104. "movq (%2, %%eax), %%mm1 \n\t" /* row n+3 */
  105. "movq (%3, %%eax), %%mm0 \n\t" /* row n+4, carried into the next pass in mm0 */
  106. PAVGB" %%mm1, %%mm2 \n\t" /* mm2 = avg(row n+2, row n+3) */
  107. PAVGB" %%mm0, %%mm1 \n\t" /* mm1 = avg(row n+3, row n+4) */
  108. "movq %%mm2, (%4, %%eax) \n\t"
  109. "movq %%mm1, (%5, %%eax) \n\t"
  110. "addl %6, %%eax \n\t"
  111. "subl $4, %0 \n\t" /* four rows finished in this pass */
  112. " jnz 1b \n\t"
  113. :"+g"(h)
  114. :"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block),
  115. "r" (block+line_size), "g"(line_size<<1)
  116. :"%eax", "memory");
  117. }
  /*
   * put_no_rnd_pixels_y2: like put_pixels_y2 (average of every source row and
   * the row below it, written to block), except that mm7 is subtracted with
   * unsigned saturation (psubusb) from every second source row -- the one
   * loaded into mm1 -- before it takes part in the averages.  As each
   * output row combines one mm1 row with one untouched row, exactly one
   * operand of every average has been lowered.
   *
   * mm7 is loaded from mm_bone, which is defined outside this file;
   * presumably every byte of it is 1, so the subtraction offsets the upward
   * rounding of the average ("no_rnd") -- confirm against the definition.
   * The result is an approximation, not an exact truncating average.
   *
   * Rows are 8 bytes wide; four rows are produced per loop pass and h is
   * decremented by 4, so h must be a positive multiple of 4.  Producing h
   * output rows reads h + 1 source rows.
   *
   * Asm operands:
   *   %0 = h (row counter, read/write)
   *   %1 = pixels   %2 = pixels + line_size   %3 = pixels + 2 * line_size
   *   %4 = block    %5 = block + line_size
   *   %6 = 2 * line_size
   * eax is the running byte offset and is listed as clobbered.
   */
  118. static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  119. {
  120. __asm __volatile(
  121. "movq "MANGLE(mm_bone)", %%mm7 \n\t" /* constant kept in mm7 for the whole loop */
  122. "xorl %%eax, %%eax \n\t"
  123. "movq (%1), %%mm0 \n\t" /* preload source row 0 */
  124. ".balign 16 \n\t"
  125. "1: \n\t"
  126. "movq (%2, %%eax), %%mm1 \n\t" /* row n+1 */
  127. "movq (%3, %%eax), %%mm2 \n\t" /* row n+2 */
  128. "psubusb %%mm7, %%mm1 \n\t" /* lower row n+1; it is used by both averages below */
  129. PAVGB" %%mm1, %%mm0 \n\t"
  130. PAVGB" %%mm2, %%mm1 \n\t"
  131. "movq %%mm0, (%4, %%eax) \n\t"
  132. "movq %%mm1, (%5, %%eax) \n\t"
  133. "addl %6, %%eax \n\t" /* step two rows down */
  134. "movq (%2, %%eax), %%mm1 \n\t" /* row n+3 */
  135. "movq (%3, %%eax), %%mm0 \n\t" /* row n+4, carried into the next pass in mm0 */
  136. "psubusb %%mm7, %%mm1 \n\t" /* lower row n+3 */
  137. PAVGB" %%mm1, %%mm2 \n\t"
  138. PAVGB" %%mm0, %%mm1 \n\t"
  139. "movq %%mm2, (%4, %%eax) \n\t"
  140. "movq %%mm1, (%5, %%eax) \n\t"
  141. "addl %6, %%eax \n\t"
  142. "subl $4, %0 \n\t" /* four rows finished in this pass */
  143. " jnz 1b \n\t"
  144. :"+g"(h)
  145. :"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block),
  146. "r" (block+line_size), "g"(line_size<<1)
  147. :"%eax", "memory");
  148. }
  /*
   * avg_pixels: replace every byte of block with the byte-wise average of
   * that byte and the corresponding byte of pixels.
   *
   * Rows are 8 bytes wide and both buffers use the stride line_size.  Four
   * rows are processed per loop pass (two rows per half, unrolled twice) and
   * h is decremented by 4, so h must be a positive multiple of 4.
   *
   * PAVGB is a macro provided by the including file; presumably the packed
   * byte-average instruction of the target -- confirm against its definition.
   *
   * Asm operands:
   *   %0 = h (row counter, read/write)
   *   %1 = pixels               %2 = pixels + line_size
   *   %3 = block                %4 = block + line_size
   *   %5 = 2 * line_size
   * eax is the running byte offset and is listed as clobbered.
   */
  149. static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  150. {
  151. __asm __volatile(
  152. "xorl %%eax, %%eax \n\t"
  153. ".balign 16 \n\t"
  154. "1: \n\t"
  155. "movq (%1, %%eax), %%mm0 \n\t" /* two source rows */
  156. "movq (%2, %%eax), %%mm2 \n\t"
  157. "movq (%3, %%eax), %%mm3 \n\t" /* current contents of the two destination rows */
  158. "movq (%4, %%eax), %%mm4 \n\t"
  159. PAVGB" %%mm3, %%mm0 \n\t"
  160. PAVGB" %%mm4, %%mm2 \n\t"
  161. "movq %%mm0, (%3, %%eax) \n\t"
  162. "movq %%mm2, (%4, %%eax) \n\t"
  163. "addl %5, %%eax \n\t" /* step two rows down */
  164. "movq (%1, %%eax), %%mm0 \n\t" /* second unrolled copy: the next two rows */
  165. "movq (%2, %%eax), %%mm2 \n\t"
  166. "movq (%3, %%eax), %%mm3 \n\t"
  167. "movq (%4, %%eax), %%mm4 \n\t"
  168. PAVGB" %%mm3, %%mm0 \n\t"
  169. PAVGB" %%mm4, %%mm2 \n\t"
  170. "movq %%mm0, (%3, %%eax) \n\t"
  171. "movq %%mm2, (%4, %%eax) \n\t"
  172. "addl %5, %%eax \n\t"
  173. "subl $4, %0 \n\t" /* four rows finished in this pass */
  174. " jnz 1b \n\t"
  175. :"+g"(h)
  176. :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size),
  177. "r"(line_size<<1)
  178. :"%eax", "memory");
  179. }
  /*
   * avg_pixels_x2: first average every source byte with its right-hand
   * neighbour (pixels[x] and pixels[x + 1]), then average that result with
   * the byte already stored in block and write it back.
   *
   * Because two averages are chained, the final value is rounded twice.
   *
   * Rows are 8 bytes wide (9 source bytes are read per row because of the +1
   * load).  Four rows are processed per loop pass and h is decremented by 4,
   * so h must be a positive multiple of 4.
   *
   * PAVGB is a macro provided by the including file; presumably the packed
   * byte-average instruction of the target -- confirm against its definition.
   *
   * Asm operands:
   *   %0 = h (row counter, read/write)
   *   %1 = pixels               %2 = pixels + line_size
   *   %3 = block                %4 = block + line_size
   *   %5 = 2 * line_size
   * eax is the running byte offset and is listed as clobbered.
   */
  180. static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  181. {
  182. __asm __volatile(
  183. "xorl %%eax, %%eax \n\t"
  184. ".balign 16 \n\t"
  185. "1: \n\t"
  186. "movq (%1, %%eax), %%mm0 \n\t"
  187. "movq 1(%1, %%eax), %%mm1 \n\t"
  188. "movq (%2, %%eax), %%mm2 \n\t"
  189. "movq 1(%2, %%eax), %%mm3 \n\t"
  190. PAVGB" %%mm1, %%mm0 \n\t" /* horizontal average of the first row */
  191. PAVGB" %%mm3, %%mm2 \n\t" /* horizontal average of the second row */
  192. "movq (%3, %%eax), %%mm3 \n\t" /* mm3 is free again: reuse it for the destination row */
  193. "movq (%4, %%eax), %%mm4 \n\t"
  194. PAVGB" %%mm3, %%mm0 \n\t" /* blend with what is already in block */
  195. PAVGB" %%mm4, %%mm2 \n\t"
  196. "movq %%mm0, (%3, %%eax) \n\t"
  197. "movq %%mm2, (%4, %%eax) \n\t"
  198. "addl %5, %%eax \n\t" /* step two rows down */
  199. "movq (%1, %%eax), %%mm0 \n\t" /* second unrolled copy: the next two rows */
  200. "movq 1(%1, %%eax), %%mm1 \n\t"
  201. "movq (%2, %%eax), %%mm2 \n\t"
  202. "movq 1(%2, %%eax), %%mm3 \n\t"
  203. PAVGB" %%mm1, %%mm0 \n\t"
  204. PAVGB" %%mm3, %%mm2 \n\t"
  205. "movq (%3, %%eax), %%mm3 \n\t"
  206. "movq (%4, %%eax), %%mm4 \n\t"
  207. PAVGB" %%mm3, %%mm0 \n\t"
  208. PAVGB" %%mm4, %%mm2 \n\t"
  209. "movq %%mm0, (%3, %%eax) \n\t"
  210. "movq %%mm2, (%4, %%eax) \n\t"
  211. "addl %5, %%eax \n\t"
  212. "subl $4, %0 \n\t" /* four rows finished in this pass */
  213. " jnz 1b \n\t"
  214. :"+g"(h)
  215. :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size),
  216. "r"(line_size<<1)
  217. :"%eax", "memory");
  218. }
  /*
   * avg_pixels_y2: first average every source row with the row directly
   * below it, then average that result with the bytes already stored in
   * block and write it back.
   *
   * Because two averages are chained, the final value is rounded twice.
   *
   * Rows are 8 bytes wide.  Source row 0 is loaded once before the loop and
   * every later source row is loaded once and reused, so mm0/mm1/mm2 swap
   * roles between the two unrolled halves.  Producing h output rows reads
   * h + 1 source rows.  Four rows are processed per loop pass and h is
   * decremented by 4, so h must be a positive multiple of 4.
   *
   * PAVGB is a macro provided by the including file; presumably the packed
   * byte-average instruction of the target -- confirm against its definition.
   *
   * Asm operands:
   *   %0 = h (row counter, read/write)
   *   %1 = pixels   %2 = pixels + line_size   %3 = pixels + 2 * line_size
   *   %4 = block    %5 = block + line_size
   *   %6 = 2 * line_size
   * eax is the running byte offset and is listed as clobbered.
   */
  219. static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  220. {
  221. __asm __volatile(
  222. "xorl %%eax, %%eax \n\t"
  223. "movq (%1), %%mm0 \n\t" /* preload source row 0 */
  224. ".balign 16 \n\t"
  225. "1: \n\t"
  226. "movq (%2, %%eax), %%mm1 \n\t" /* row n+1 */
  227. "movq (%3, %%eax), %%mm2 \n\t" /* row n+2 */
  228. PAVGB" %%mm1, %%mm0 \n\t" /* mm0 = avg(row n, row n+1) */
  229. PAVGB" %%mm2, %%mm1 \n\t" /* mm1 = avg(row n+1, row n+2); mm2 is kept for the next half */
  230. "movq (%4, %%eax), %%mm3 \n\t" /* current contents of the two destination rows */
  231. "movq (%5, %%eax), %%mm4 \n\t"
  232. PAVGB" %%mm3, %%mm0 \n\t" /* blend with what is already in block */
  233. PAVGB" %%mm4, %%mm1 \n\t"
  234. "movq %%mm0, (%4, %%eax) \n\t"
  235. "movq %%mm1, (%5, %%eax) \n\t"
  236. "addl %6, %%eax \n\t" /* step two rows down */
  237. "movq (%2, %%eax), %%mm1 \n\t" /* row n+3 */
  238. "movq (%3, %%eax), %%mm0 \n\t" /* row n+4, carried into the next pass in mm0 */
  239. PAVGB" %%mm1, %%mm2 \n\t" /* mm2 = avg(row n+2, row n+3) */
  240. PAVGB" %%mm0, %%mm1 \n\t" /* mm1 = avg(row n+3, row n+4) */
  241. "movq (%4, %%eax), %%mm3 \n\t"
  242. "movq (%5, %%eax), %%mm4 \n\t"
  243. PAVGB" %%mm3, %%mm2 \n\t"
  244. PAVGB" %%mm4, %%mm1 \n\t"
  245. "movq %%mm2, (%4, %%eax) \n\t"
  246. "movq %%mm1, (%5, %%eax) \n\t"
  247. "addl %6, %%eax \n\t"
  248. "subl $4, %0 \n\t" /* four rows finished in this pass */
  249. " jnz 1b \n\t"
  250. :"+g"(h)
  251. :"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block),
  252. "r" (block+line_size), "g"(line_size<<1)
  253. :"%eax", "memory");
  254. }
  255. // Note: this is not correctly rounded, but this function is only used for B-frames, so it doesn't matter.
  /*
   * avg_pixels_xy2: approximate the average of each 2x2 source neighbourhood
   * (pixels[x], pixels[x + 1] and the same two bytes one row below) by
   * chaining byte averages -- first horizontally within each row, then
   * vertically between adjacent rows -- and finally average that value with
   * the byte already stored in block.
   *
   * Three averages are chained per output byte, so the result is not an
   * exactly rounded four-tap mean.  mm7 (loaded from mm_bone, defined outside
   * this file; presumably every byte is 1 -- confirm) is subtracted with
   * unsigned saturation from one source row per loop pass, and only in the
   * first unrolled half, as a partial correction.
   *
   * Rows are 8 bytes wide (9 source bytes are read per row).  The horizontal
   * average of source row 0 is computed once before the loop; afterwards the
   * horizontal average of each row is computed once and reused for the next
   * output row, so mm0/mm1/mm2 swap roles between the two unrolled halves.
   * Producing h output rows reads h + 1 source rows.  Four rows are processed
   * per loop pass and h is decremented by 4, so h must be a positive multiple
   * of 4.
   *
   * PAVGB is a macro provided by the including file; presumably the packed
   * byte-average instruction of the target -- confirm against its definition.
   *
   * Asm operands:
   *   %0 = h (row counter, read/write)
   *   %1 = pixels   %2 = pixels + line_size   %3 = pixels + 2 * line_size
   *   %4 = block    %5 = block + line_size
   *   %6 = 2 * line_size
   * eax is the running byte offset and is listed as clobbered.
   */
  256. static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  257. {
  258. __asm __volatile(
  259. "movq "MANGLE(mm_bone)", %%mm7 \n\t" /* constant kept in mm7 for the whole loop */
  260. "xorl %%eax, %%eax \n\t"
  261. "movq (%1), %%mm0 \n\t"
  262. "movq 1(%1), %%mm1 \n\t"
  263. PAVGB" %%mm1, %%mm0 \n\t" /* mm0 = horizontal average of source row 0 */
  264. ".balign 16 \n\t"
  265. "1: \n\t"
  266. "movq (%2, %%eax), %%mm1 \n\t" /* row n+1 */
  267. "movq (%3, %%eax), %%mm2 \n\t" /* row n+2 */
  268. "movq 1(%2, %%eax), %%mm3 \n\t" /* row n+1 shifted by one byte */
  269. "movq 1(%3, %%eax), %%mm4 \n\t" /* row n+2 shifted by one byte */
  270. "psubusb %%mm7, %%mm2 \n\t" /* lower row n+2 before it is averaged */
  271. PAVGB" %%mm3, %%mm1 \n\t" /* mm1 = horizontal average of row n+1 */
  272. PAVGB" %%mm4, %%mm2 \n\t" /* mm2 = horizontal average of row n+2, kept for the next half */
  273. PAVGB" %%mm1, %%mm0 \n\t" /* vertical: rows n and n+1 */
  274. PAVGB" %%mm2, %%mm1 \n\t" /* vertical: rows n+1 and n+2 */
  275. "movq (%4, %%eax), %%mm3 \n\t" /* current contents of the two destination rows */
  276. "movq (%5, %%eax), %%mm4 \n\t"
  277. PAVGB" %%mm3, %%mm0 \n\t" /* blend with what is already in block */
  278. PAVGB" %%mm4, %%mm1 \n\t"
  279. "movq %%mm0, (%4, %%eax) \n\t"
  280. "movq %%mm1, (%5, %%eax) \n\t"
  281. "addl %6, %%eax \n\t" /* step two rows down */
  282. "movq (%2, %%eax), %%mm1 \n\t" /* row n+3 */
  283. "movq (%3, %%eax), %%mm0 \n\t" /* row n+4 */
  284. "movq 1(%2, %%eax), %%mm3 \n\t"
  285. "movq 1(%3, %%eax), %%mm4 \n\t"
  286. PAVGB" %%mm3, %%mm1 \n\t" /* mm1 = horizontal average of row n+3 */
  287. PAVGB" %%mm4, %%mm0 \n\t" /* mm0 = horizontal average of row n+4, carried into the next pass */
  288. PAVGB" %%mm1, %%mm2 \n\t" /* vertical: rows n+2 and n+3 */
  289. PAVGB" %%mm0, %%mm1 \n\t" /* vertical: rows n+3 and n+4 */
  290. "movq (%4, %%eax), %%mm3 \n\t"
  291. "movq (%5, %%eax), %%mm4 \n\t"
  292. PAVGB" %%mm3, %%mm2 \n\t"
  293. PAVGB" %%mm4, %%mm1 \n\t"
  294. "movq %%mm2, (%4, %%eax) \n\t"
  295. "movq %%mm1, (%5, %%eax) \n\t"
  296. "addl %6, %%eax \n\t"
  297. "subl $4, %0 \n\t" /* four rows finished in this pass */
  298. " jnz 1b \n\t"
  299. :"+g"(h)
  300. :"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block),
  301. "r" (block+line_size), "g"(line_size<<1)
  302. :"%eax", "memory");
  303. }
  304. // Note: the sub* functions are not used
  305. static void DEF(sub_pixels_x2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  306. {
  307. DCTELEM *p;
  308. const UINT8 *pix;
  309. p = block;
  310. pix = pixels;
  311. __asm __volatile(
  312. "pxor %%mm7, %%mm7":);
  313. do {
  314. __asm __volatile(
  315. "movq 1%1, %%mm2\n\t"
  316. "movq %0, %%mm0\n\t"
  317. PAVGB" %1, %%mm2\n\t"
  318. "movq 8%0, %%mm1\n\t"
  319. "movq %%mm2, %%mm3\n\t"
  320. "punpcklbw %%mm7, %%mm2\n\t"
  321. "punpckhbw %%mm7, %%mm3\n\t"
  322. "psubsw %%mm2, %%mm0\n\t"
  323. "psubsw %%mm3, %%mm1\n\t"
  324. "movq %%mm0, %0\n\t"
  325. "movq %%mm1, 8%0\n\t"
  326. :"+m"(*p)
  327. :"m"(*pix)
  328. :"memory");
  329. pix += line_size;
  330. p += 8;
  331. } while (--h);
  332. }
  333. static void DEF(sub_pixels_y2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  334. {
  335. DCTELEM *p;
  336. const UINT8 *pix;
  337. p = block;
  338. pix = pixels;
  339. __asm __volatile(
  340. "pxor %%mm7, %%mm7":);
  341. do {
  342. __asm __volatile(
  343. "movq %2, %%mm2\n\t"
  344. "movq %0, %%mm0\n\t"
  345. PAVGB" %1, %%mm2\n\t"
  346. "movq 8%0, %%mm1\n\t"
  347. "movq %%mm2, %%mm3\n\t"
  348. "punpcklbw %%mm7, %%mm2\n\t"
  349. "punpckhbw %%mm7, %%mm3\n\t"
  350. "psubsw %%mm2, %%mm0\n\t"
  351. "psubsw %%mm3, %%mm1\n\t"
  352. "movq %%mm0, %0\n\t"
  353. "movq %%mm1, 8%0\n\t"
  354. :"+m"(*p)
  355. :"m"(*pix), "m"(*(pix+line_size))
  356. :"memory");
  357. pix += line_size;
  358. p += 8;
  359. } while (--h);
  360. }